Example usage of org.apache.commons.math3.random.Well1024a.nextLong()

Introduction

This page collects example usages of the nextLong() method of org.apache.commons.math3.random.Well1024a.

Prototype

public long nextLong()

Source Link

Usage

From source file:com.ibm.bi.dml.runtime.matrix.data.LibMatrixDatagen.java

/**
 * /*  w  ww  .  ja  va 2  s.com*/
 * @param bigrand
 * @param nrb
 * @param ncb
 * @return
 */
private static long[] generateSeedsForCP(Well1024a bigrand, int nrb, int ncb) {
    int numBlocks = nrb * ncb;
    long[] seeds = new long[numBlocks];
    for (int l = 0; l < numBlocks; l++) {
        // case of CP: generate a block-level seed from matrix-level Well1024a seed
        seeds[l] = bigrand.nextLong();
    }

    return seeds;
}

From source file:com.ibm.bi.dml.runtime.matrix.DataGenMR.java

/**
 * <p>Starts a DataGen MapReduce job which will produce one or more generated (rand or seq) objects.</p>
 *
 * <p>For every data generation instruction a small text input file is written first: one line per
 * block, holding the block indexes plus either the block dimensions, number of non-zeros and a
 * block-level seed (rand) or the block's from/to/increment values (seq). These files are the
 * inputs of the job and are deleted again once the job was attempted.</p>
 *
 * @param inst MR job instruction, used here only to print the complete instruction at trace level
 * @param dataGenInstructions data generation instructions (Rand or Seq), one per generated object
 * @param instructionsInMapper instructions performed in the mapper
 * @param aggInstructionsInReducer aggregate instructions performed in the combiner and reducer
 * @param otherInstructionsInReducer instructions performed in the reducer after the aggregation
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes
 * @param dimsUnknownFilePrefix file path prefix of the dimensions file
 * @param outputs output files
 * @param outputInfos output information for the outputs
 * @return job return built from the result matrix characteristics, the output infos and the job success flag
 * @throws Exception if an error occurred while preparing the inputs or in the MapReduce phase
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    //delimiter is only placed between instructions, so nothing has to be stripped afterwards
    StringBuilder dataGenInsBuilder = new StringBuilder();
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        if (i > 0)
            dataGenInsBuilder.append(Lop.INSTRUCTION_DELIMITOR);
        dataGenInsBuilder.append(dataGenInstructions[i]);

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + "tmp" + _seqRandInput.getNextID() + ".randinput";
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            try {
                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();

                //seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                        randInst.getSparsity());
                int nnzIx = 0;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < clens[i]; c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                        //line format: rowBlockIndex,colBlockIndex,blockRows,blockCols,nnz,seed
                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(nnz[nnzIx++]);
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }

                //PrintWriter never throws on write failures; surface them instead of
                //running the job on a truncated seed file
                if (pw.checkError())
                    throw new DMLRuntimeException("Failed to write rand input file: " + inputs[i]);
            } finally {
                pw.close();
                fsOut.close();
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            //use the ID sequence (as for rand) instead of the current time: two seq
            //instructions handled within the same millisecond must not share a file
            inputs[i] = genInst.getBaseDir() + "tmp" + _seqRandInput.getNextID() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            try {
                StringBuilder sb = new StringBuilder();

                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    temp = block_to + incr; // next block starts from here

                    // line format: rowBlockIndex,colBlockIndex,from,to,incr
                    // (block size need not be included while generating seq())
                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);

                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }

                //PrintWriter never throws on write failures; surface them explicitly
                if (pw.checkError())
                    throw new DMLRuntimeException("Failed to write seq input file: " + inputs[i]);
            } finally {
                pw.close();
                fsOut.close();
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    String dataGenInsStr = dataGenInsBuilder.toString();
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the block size
        //NOTE(review): block sizes were already set above; this second call looks redundant - confirm before removing
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instrucions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        //dense size estimate in long arithmetic (8 bytes per cell); clamp to [1,capacity]
        //before narrowing so neither the product nor the cast can overflow an int
        long requestedMappers = 8L * maxbrlen * maxbclen * numblocks / dfsblocksize;
        int nmapers = (int) Math.max(Math.min(requestedMappers, capacity), 1);
        job.setNumMapTasks(nmapers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        //the generated input files are temporary: remove them whether or not the job succeeded
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.instructions.spark.RandSPInstruction.java

/**
 * Helper function to construct a sample.
 *
 * <p>The population range starting at 1 and ending at {@code maxValue} is cut into
 * consecutive sub-ranges, each described by a {@code SampleTask} carrying its own seed.
 * The tasks are sampled in a distributed manner, the sampled values are randomized,
 * trimmed to {@code rows} elements and converted into binary blocks, which are then
 * registered as the RDD of the output variable.</p>
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if more elements than the population size are requested
 *                             without replacement
 */
private void generateSample(SparkExecutionContext sec) throws DMLRuntimeException {
    // without replacement the sample cannot exceed the population
    if (!replace && maxValue < rows) {
        throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue
                + ") can only be generated with replacement.");
    }

    if (LOG.isTraceEnabled()) {
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + rows + ", replace="
                + replace + ", seed=" + seed);
    }

    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
    final double samplingRate = SamplingUtils.computeFractionForSampleSize((int) rows,
            UtilFunctions.toLong(maxValue), replace);

    final Well1024a seedGen = LibMatrixDatagen.setupSeedsForRand(seed);

    // one partition per HDFS block of estimated dense output; population is split evenly across them
    final double blockBytes = InfrastructureAnalyzer.getHDFSBlockSize();
    final long outputBytes = MatrixBlock.estimateSizeDenseInMemory(rows, 1);
    final int numPartitions = (int) Math.ceil((double) outputBytes / blockBytes);
    final long partitionSize = (long) Math.ceil(maxValue / numPartitions);

    // one SampleTask (range start + seed) per sub-range of the population
    final ArrayList<SampleTask> tasks = new ArrayList<SampleTask>();
    for (long rangeStart = 1; rangeStart <= maxValue; rangeStart += partitionSize) {
        SampleTask task = new SampleTask();
        task.range_start = rangeStart;
        task.seed = seedGen.nextLong();
        tasks.add(task);
    }
    final JavaRDD<SampleTask> taskRDD = sec.getSparkContext().parallelize(tasks, numPartitions);

    // Construct the sample in a distributed manner
    final GenerateSampleBlock sampler = new GenerateSampleBlock(replace, samplingRate, (long) maxValue,
            partitionSize);
    final JavaRDD<Double> sampledRDD = taskRDD.flatMap(sampler);

    // Randomize the sampled elements
    final JavaRDD<Double> shuffledRDD = sampledRDD.mapToPair(new AttachRandom()).sortByKey().values();

    // Trim the sampled list to required size & attach matrix indexes to randomized elements
    final JavaPairRDD<MatrixIndexes, MatrixCell> cellRDD = shuffledRDD.zipWithIndex()
            .filter(new TrimSample(rows))
            .mapToPair(new Double2MatrixCell());

    // Construct BinaryBlock representation
    final MatrixCharacteristics sampleDims = new MatrixCharacteristics(rows, 1, rowsInBlock, colsInBlock,
            rows);
    final JavaPairRDD<MatrixIndexes, MatrixBlock> blockRDD = RDDConverterUtils
            .binaryCellToBinaryBlock(sec.getSparkContext(), cellRDD, sampleDims, true);

    // output handling: record the number of non-zeros and hand over the RDD
    final MatrixCharacteristics outputDims = sec.getMatrixCharacteristics(output.getName());
    outputDims.setNonZeros(rows);

    sec.setRDDHandleForVariable(output.getName(), blockRDD);
}

From source file:com.ibm.bi.dml.runtime.instructions.spark.RandSPInstruction.java

/**
 * Executes this data generation instruction on Spark, dispatching on the data
 * generation method (RAND, SEQ or SAMPLE).
 *
 * <p>RAND: if {@code isMemAvail} reports enough memory and the platform is not forced
 * to Spark, the matrix is generated locally; otherwise one (seed, nnz) pair per block
 * is created and the blocks are generated from those seeds in an RDD.
 * SEQ: one start offset per row block is created and the blocks are generated from
 * those offsets. SAMPLE: delegated to {@code generateSample}.</p>
 *
 * @param ec execution context; must be a {@code SparkExecutionContext}
 * @throws DMLRuntimeException if the seq increment is zero or has the wrong sign, or if
 *                             the computed sequence length disagrees with the known row count
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    //process specific datagen operator
    if (this.method == DataGenMethod.RAND) {
        // The implementation is in same spirit as MapReduce
        // We generate seeds similar to com.ibm.bi.dml.runtime.matrix.DataGenMR
        // and then generate blocks similar to com.ibm.bi.dml.runtime.matrix.mapred.DataGenMapper

        //generate pseudo-random seed (because not specified) 
        long lSeed = seed; //seed per invocation
        if (lSeed == DataGenOp.UNSPECIFIED_SEED)
            lSeed = DataGenOp.generateRandomSeed();

        if (LOG.isTraceEnabled())
            LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");

        //Check if there is sufficient memory for matrix to be created and execution platform is not forced Spark
        if (isMemAvail(rows, cols, sparsity, minValue, maxValue)
                && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
            RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows,
                    (int) cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
            MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);

            sec.setMatrixOutput(output.getName(), mb);
            //executed locally, hence not counted as an executed Spark instruction
            Statistics.decrementNoOfExecutedSPInst();
            return;
        }

        // seed generation (partitioned to bound memory requirements)
        JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null;
        Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
        long[] nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
        double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
        long numBlocks = nnz.length;
        long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock);

        for (long p = 0; p < numBlocks; p += SEED_PARTITION_SIZE) {
            ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>();
            double partitionSize = 0;
            for (long i = p; i < Math.min(p + SEED_PARTITION_SIZE, numBlocks); i++) {
                //linear block position i -> 1-based (row block, column block) index
                long r = 1 + i / numColBlocks;
                long c = 1 + i % numColBlocks;
                MatrixIndexes indx = new MatrixIndexes(r, c);
                Long seedForBlock = bigrand.nextLong();
                seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx,
                        new Tuple2<Long, Long>(seedForBlock, nnz[(int) i])));
                partitionSize += nnz[(int) i] * 8 + 16;
            }

            //for load balancing: degree of parallelism such that ~128MB per partition
            int numPartitions = (int) Math.max(Math.min(partitionSize / hdfsBlockSize, seeds.size()), 1);

            //combine seeds partitions to seed rdd
            JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD2 = JavaPairRDD
                    .fromJavaRDD(sec.getSparkContext().parallelize(seeds, numPartitions));
            seedsRDD = (seedsRDD != null) ? seedsRDD.union(seedsRDD2) : seedsRDD2;
        }

        //execute rand instruction over seed input
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols,
                rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));

        //output handling
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        if (!mcOut.dimsKnown(true)) {
            //note: we cannot compute the nnz from sparsity because this would not reflect the 
            //actual number of non-zeros, except for extreme values of sparsity equals 0 or 1.
            long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1;
            mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz);
        }
        sec.setRDDHandleForVariable(output.getName(), out);
    } else if (this.method == DataGenMethod.SEQ) {
        //sanity check valid increment
        if (seq_incr == 0) {
            throw new DMLRuntimeException(
                    "ERROR: While performing seq(" + seq_from + "," + seq_to + "," + seq_incr + ")");
        }

        if (LOG.isTraceEnabled())
            LOG.trace("Process RandSPInstruction seq with seqFrom=" + seq_from + ", seqTo=" + seq_to
                    + ", seqIncr=" + seq_incr);

        //sanity check sign of increment: stepping away from seq_to never reaches it
        double numSteps = (seq_to - seq_from) / seq_incr;
        if (numSteps < 0) {
            throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
        }

        //number of sequence elements: floor (not round) the step count, otherwise a range
        //that is not a multiple of the increment, e.g. seq(1,10,2), is over-counted by one
        long nnz = 1 + (long) Math.floor(numSteps);

        //sanity check number of non-zeros (before any offsets are materialized)
        if (nnz != rows && rows != -1) {
            throw new DMLRuntimeException("Incorrect number of non-zeros: " + nnz + " != " + rows);
        }

        // offset generation (partitioned to bound memory requirements)
        JavaRDD<Double> offsetsRDD = null;
        double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
        long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock);

        for (long p = 0; p < numBlocks; p += SEED_PARTITION_SIZE) {
            ArrayList<Double> offsets = new ArrayList<Double>();
            double partitionSize = 0;
            for (long i = p; i < Math.min(p + SEED_PARTITION_SIZE, numBlocks); i++) {
                //first value of the i-th row block
                double off = seq_from + seq_incr * i * rowsInBlock;
                offsets.add(off);
                partitionSize += rowsInBlock * 8 + 16;
            }

            //for load balancing: degree of parallelism such that ~128MB per partition
            int numPartitions = (int) Math.max(Math.min(partitionSize / hdfsBlockSize, offsets.size()), 1);

            //combine seeds partitions to seed rdd
            JavaRDD<Double> offsetsRDD2 = sec.getSparkContext().parallelize(offsets, numPartitions);

            offsetsRDD = (offsetsRDD != null) ? offsetsRDD.union(offsetsRDD2) : offsetsRDD2;
        }

        //execute seq instruction over offset input
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD
                .mapToPair(new GenerateSequenceBlock(rowsInBlock, seq_from, seq_to, seq_incr));

        //output handling
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        if (!mcOut.dimsKnown()) {
            mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz);
        }
        sec.setRDDHandleForVariable(output.getName(), out);
    } else if (this.method == DataGenMethod.SAMPLE) {
        generateSample(sec);
    }
}

From source file:org.apache.sysml.runtime.instructions.spark.RandSPInstruction.java

/**
 * Generates a random matrix for this instruction.
 *
 * <p>If {@code isMemAvail} reports enough memory and the platform is not forced to
 * Spark, the matrix is generated locally. Otherwise one (seed, nnz) pair per block is
 * created - in memory for fewer than {@code INMEMORY_NUMBLOCKS_THRESHOLD} blocks, via a
 * text file otherwise - and the blocks are generated from those seeds in an RDD that is
 * registered for the output variable.</p>
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the seed file cannot be written
 */
private void generateRandData(SparkExecutionContext sec) throws DMLRuntimeException {
    //step 1: generate pseudo-random seed (because not specified) 
    long lSeed = seed; //seed per invocation
    if (lSeed == DataGenOp.UNSPECIFIED_SEED)
        lSeed = DataGenOp.generateRandomSeed();

    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");

    //step 2: potential in-memory rand operations if applicable
    if (isMemAvail(rows, cols, sparsity, minValue, maxValue)
            && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
        RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows, (int) cols,
                rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
        MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);

        sec.setMatrixOutput(output.getName(), mb);
        //executed locally, hence not counted as an executed Spark instruction
        Statistics.decrementNoOfExecutedSPInst();
        return;
    }

    //step 3: seed generation 
    JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null;
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
    long[] nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(rows, cols, rowsInBlock, colsInBlock,
            rows * cols * sparsity); //overestimate for on disk, ensures hdfs block per partition
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long numBlocks = nnz.length;
    long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock);

    //for load balancing: degree of parallelism such that ~128MB per partition
    //(same for both construction paths below)
    int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);

    //a) in-memory seed rdd construction 
    if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
        //numBlocks is an array length and therefore fits into an int
        ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>(
                (int) numBlocks);
        for (long i = 0; i < numBlocks; i++) {
            //linear block position i -> 1-based (row block, column block) index
            long r = 1 + i / numColBlocks;
            long c = 1 + i % numColBlocks;
            MatrixIndexes indx = new MatrixIndexes(r, c);
            Long seedForBlock = bigrand.nextLong();
            seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx,
                    new Tuple2<Long, Long>(seedForBlock, nnz[(int) i])));
        }

        //create seeds rdd 
        seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
    }
    //b) file-based seed rdd construction (for robustness wrt large number of blocks)
    else {
        String path = LibMatrixDatagen.generateUniqueSeedPath(dir);

        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            FSDataOutputStream fsOut = fs.create(new Path(path));
            PrintWriter pw = new PrintWriter(fsOut);
            try {
                StringBuilder sb = new StringBuilder();
                for (long i = 0; i < numBlocks; i++) {
                    //line format: rowBlockIndex,colBlockIndex,seed,nnz
                    sb.append(1 + i / numColBlocks);
                    sb.append(',');
                    sb.append(1 + i % numColBlocks);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    sb.append(',');
                    sb.append(nnz[(int) i]);
                    pw.println(sb.toString());
                    sb.setLength(0);
                }

                //PrintWriter never throws on write failures; surface them instead of
                //generating blocks from a truncated seed file
                if (pw.checkError())
                    throw new IOException("Failed to write seed file: " + path);
            } finally {
                //release the streams even if seed generation or writing failed
                pw.close();
                fsOut.close();
            }
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }

        //create seeds rdd 
        seedsRDD = sec.getSparkContext().textFile(path, numPartitions).mapToPair(new ExtractSeedTuple());
    }

    //step 4: execute rand instruction over seed input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols,
            rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));

    //step 5: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown(true)) {
        //note: we cannot compute the nnz from sparsity because this would not reflect the 
        //actual number of non-zeros, except for extreme values of sparsity equals 0 or 1.
        long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1;
        mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz);
    }
    sec.setRDDHandleForVariable(output.getName(), out);
}

From source file:org.apache.sysml.runtime.matrix.data.LibMatrixDatagen.java

/**
 * Draws one seed per block from the given generator.
 *
 * @param bigrand generator the seeds are drawn from, one {@code nextLong()} call per block
 * @param nrb presumably the number of row blocks
 * @param ncb presumably the number of column blocks
 * @return array of {@code nrb * ncb} seeds, in the order they were drawn
 */
private static long[] generateSeedsForCP(Well1024a bigrand, int nrb, int ncb) {
    final long[] blockSeeds = new long[nrb * ncb];
    int pos = 0;
    while (pos < blockSeeds.length) {
        blockSeeds[pos++] = bigrand.nextLong();
    }
    return blockSeeds;
}

From source file:org.apache.sysml.runtime.matrix.DataGenMR.java

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);

            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            //seed generation
            Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
            long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                    randInst.getSparsity());
            int nnzIx = 0;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                for (long c = 0; c < clens[i]; c += bclens[i]) {
                    long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                    sb.append((r / brlens[i]) + 1);
                    sb.append(',');
                    sb.append((c / bclens[i]) + 1);
                    sb.append(',');
                    sb.append(curBlockRowSize);
                    sb.append(',');
                    sb.append(curBlockColSize);
                    sb.append(',');
                    sb.append(nnz[nnzIx++]);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            }
            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            StringBuilder sb = new StringBuilder();

            double temp = from;
            double block_from, block_to;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                long bid_i = ((r / brlens[i]) + 1);
                long bid_j = 1;
                block_from = temp;
                block_to = temp + (curBlockRowSize - 1) * incr;
                temp = block_to + incr; // next block starts from here

                sb.append(bid_i);
                sb.append(',');
                sb.append(bid_j);
                sb.append(',');
                /*
                // Need not include block size while generating seq()
                sb.append(curBlockRowSize);
                sb.append(',');
                sb.append(1);
                sb.append(',');*/
                sb.append(block_from);
                sb.append(',');
                sb.append(block_to);
                sb.append(',');
                sb.append(incr);

                pw.println(sb.toString());
                //System.out.println("MapTask " + r + ": " + sb.toString());
                sb.setLength(0);
                numblocks++;
            }

            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1);//remove the first ","
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instrucions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmapers = Math
                .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmapers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}