Example usage for org.apache.commons.math3.random Well1024a nextLong

List of usage examples for org.apache.commons.math3.random Well1024a nextLong

Introduction

On this page you can find example usages of org.apache.commons.math3.random.Well1024a#nextLong.

Prototype

public long nextLong() 
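
The following is a minimal, self-contained sketch (not taken from any of the source files below) of the pattern those files share: seed one matrix-level Well1024a generator, then draw one long per matrix block with nextLong() so that each block can be generated independently and reproducibly.

import org.apache.commons.math3.random.Well1024a;

public class Well1024aNextLongExample {
    public static void main(String[] args) {
        //matrix-level generator, seeded for reproducibility
        Well1024a bigrand = new Well1024a(1234567L);

        //draw one block-level seed per block (here: a 2 x 3 block grid)
        int numBlocks = 2 * 3;
        long[] blockSeeds = new long[numBlocks];
        for (int i = 0; i < numBlocks; i++)
            blockSeeds[i] = bigrand.nextLong();

        for (long s : blockSeeds)
            System.out.println(s);
    }
}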

Usage

From source file:com.ibm.bi.dml.runtime.matrix.data.LibMatrixDatagen.java

/**
 * Generates one block-level seed per matrix block from the given
 * matrix-level Well1024a random number generator.
 *
 * @param bigrand matrix-level Well1024a random number generator
 * @param nrb number of row blocks
 * @param ncb number of column blocks
 * @return array of block-level seeds, one per block
 */
private static long[] generateSeedsForCP(Well1024a bigrand, int nrb, int ncb) {
    int numBlocks = nrb * ncb;
    long[] seeds = new long[numBlocks];
    for (int l = 0; l < numBlocks; l++) {
        // case of CP: generate a block-level seed from matrix-level Well1024a seed
        seeds[l] = bigrand.nextLong();
    }

    return seeds;
}
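
As a hypothetical illustration (no call site for this private helper appears in the snippet above), the seeds could be consumed inside LibMatrixDatagen roughly as follows; the 4 x 2 block grid is made up for the example:

Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(42L); //setupSeedsForRand is shown in use below
long[] seeds = generateSeedsForCP(bigrand, 4, 2); //one seed per block of a 4 x 2 block grid
//each block then gets its own generator, e.g. new Well1024a(seeds[l])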

From source file:com.ibm.bi.dml.runtime.matrix.DataGenMR.java

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if an error occurred in the MapReduce phase
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + "tmp" + _seqRandInput.getNextID() + ".randinput";
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);

            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            //seed generation
            Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
            long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                    randInst.getSparsity());
            int nnzIx = 0;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                for (long c = 0; c < clens[i]; c += bclens[i]) {
                    long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                    sb.append((r / brlens[i]) + 1);
                    sb.append(',');
                    sb.append((c / bclens[i]) + 1);
                    sb.append(',');
                    sb.append(curBlockRowSize);
                    sb.append(',');
                    sb.append(curBlockColSize);
                    sb.append(',');
                    sb.append(nnz[nnzIx++]);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            }
            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            StringBuilder sb = new StringBuilder();

            double temp = from;
            double block_from, block_to;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                long bid_i = ((r / brlens[i]) + 1);
                long bid_j = 1;
                block_from = temp;
                block_to = temp + (curBlockRowSize - 1) * incr;
                temp = block_to + incr; // next block starts from here

                sb.append(bid_i);
                sb.append(',');
                sb.append(bid_j);
                sb.append(',');
                /*
                // Need not include block size while generating seq()
                sb.append(curBlockRowSize);
                sb.append(',');
                sb.append(1);
                sb.append(',');*/
                sb.append(block_from);
                sb.append(',');
                sb.append(block_to);
                sb.append(',');
                sb.append(incr);

                pw.println(sb.toString());
                //System.out.println("MapTask " + r + ": " + sb.toString());
                sb.setLength(0);
                numblocks++;
            }

            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1);//remove the leading instruction delimiter
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correct the max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math
                .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.instructions.spark.RandSPInstruction.java

/**
 * Helper function to construct a sample.
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the requested sample cannot be generated
 */
private void generateSample(SparkExecutionContext sec) throws DMLRuntimeException {
    if (maxValue < rows && !replace)
        throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue
                + ") can only be generated with replacement.");

    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + rows + ", replace="
                + replace + ", seed=" + seed);

    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
    double fraction = SamplingUtils.computeFractionForSampleSize((int) rows, UtilFunctions.toLong(maxValue),
            replace);

    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);

    // divide the population range across numPartitions by creating SampleTasks
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(rows, 1);
    int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize);
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);

    ArrayList<SampleTask> offsets = new ArrayList<SampleTask>();
    long st = 1;
    while (st <= maxValue) {
        SampleTask s = new SampleTask();
        s.range_start = st;
        s.seed = bigrand.nextLong();

        offsets.add(s);
        st = st + partitionSize;
    }
    JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);

    // Construct the sample in a distributed manner
    JavaRDD<Double> rdd = offsetRDD
            .flatMap((new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize)));

    // Randomize the sampled elements
    JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();

    // Trim the sampled list to required size & attach matrix indexes to randomized elements
    JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(rows))
            .mapToPair(new Double2MatrixCell());

    MatrixCharacteristics mcOut = new MatrixCharacteristics(rows, 1, rowsInBlock, colsInBlock, rows);

    // Construct BinaryBlock representation
    JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils
            .binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);

    MatrixCharacteristics retDims = sec.getMatrixCharacteristics(output.getName());
    retDims.setNonZeros(rows);

    sec.setRDDHandleForVariable(output.getName(), mbRDD);
}

From source file:com.ibm.bi.dml.runtime.instructions.spark.RandSPInstruction.java

@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    //process specific datagen operator
    if (this.method == DataGenMethod.RAND) {
        // The implementation is in same spirit as MapReduce
        // We generate seeds similar to com.ibm.bi.dml.runtime.matrix.DataGenMR
        // and then generate blocks similar to com.ibm.bi.dml.runtime.matrix.mapred.DataGenMapper

        //generate pseudo-random seed (because not specified) 
        long lSeed = seed; //seed per invocation
        if (lSeed == DataGenOp.UNSPECIFIED_SEED)
            lSeed = DataGenOp.generateRandomSeed();

        if (LOG.isTraceEnabled())
            LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");

        //check if there is sufficient memory to create the matrix and the execution platform is not forced to Spark
        if (isMemAvail(rows, cols, sparsity, minValue, maxValue)
                && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
            RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows,
                    (int) cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
            MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);

            sec.setMatrixOutput(output.getName(), mb);
            Statistics.decrementNoOfExecutedSPInst();
            return;
        }

        // seed generation (partitioned to bound memory requirements)
        JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null;
        Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
        long[] nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
        double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
        long numBlocks = nnz.length;
        long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock);

        for (long p = 0; p < numBlocks; p += SEED_PARTITION_SIZE) {
            ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>();
            double partitionSize = 0;
            for (long i = p; i < Math.min(p + SEED_PARTITION_SIZE, numBlocks); i++) {
                long r = 1 + i / numColBlocks;
                long c = 1 + i % numColBlocks;
                MatrixIndexes indx = new MatrixIndexes(r, c);
                Long seedForBlock = bigrand.nextLong();
                seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx,
                        new Tuple2<Long, Long>(seedForBlock, nnz[(int) i])));
                partitionSize += nnz[(int) i] * 8 + 16;
            }

            //for load balancing: degree of parallelism such that ~128MB per partition
            int numPartitions = (int) Math.max(Math.min(partitionSize / hdfsBlockSize, seeds.size()), 1);

            //combine seed partitions into the seed rdd
            JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD2 = JavaPairRDD
                    .fromJavaRDD(sec.getSparkContext().parallelize(seeds, numPartitions));
            seedsRDD = (seedsRDD != null) ? seedsRDD.union(seedsRDD2) : seedsRDD2;
        }

        //execute rand instruction over seed input
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols,
                rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));

        //output handling
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        if (!mcOut.dimsKnown(true)) {
            //note: we cannot compute the nnz from sparsity because this would not reflect the
            //actual number of non-zeros, except for the extreme cases sparsity == 0 or sparsity == 1.
            long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1;
            mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz);
        }
        sec.setRDDHandleForVariable(output.getName(), out);
    } else if (this.method == DataGenMethod.SEQ) {
        //sanity check valid increment
        if (seq_incr == 0) {
            throw new DMLRuntimeException(
                    "ERROR: While performing seq(" + seq_from + "," + seq_to + "," + seq_incr + ")");
        }

        if (LOG.isTraceEnabled())
            LOG.trace("Process RandSPInstruction seq with seqFrom=" + seq_from + ", seqTo=" + seq_to
                    + ", seqIncr" + seq_incr);

        // offset generation (partitioned to bound memory requirements)
        JavaRDD<Double> offsetsRDD = null;
        double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
        long nnz = (long) Math.abs(Math.round((seq_to - seq_from) / seq_incr)) + 1;
        long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock);

        for (long p = 0; p < numBlocks; p += SEED_PARTITION_SIZE) {
            ArrayList<Double> offsets = new ArrayList<Double>();
            double partitionSize = 0;
            for (long i = p; i < Math.min(p + SEED_PARTITION_SIZE, numBlocks); i++) {
                double off = seq_from + seq_incr * i * rowsInBlock;
                offsets.add(off);
                partitionSize += rowsInBlock * 8 + 16;
            }

            //for load balancing: degree of parallelism such that ~128MB per partition
            int numPartitions = (int) Math.max(Math.min(partitionSize / hdfsBlockSize, offsets.size()), 1);

            //combine offset partitions into the offset rdd
            JavaRDD<Double> offsetsRDD2 = sec.getSparkContext().parallelize(offsets, numPartitions);

            offsetsRDD = (offsetsRDD != null) ? offsetsRDD.union(offsetsRDD2) : offsetsRDD2;
        }

        //sanity check number of non-zeros
        if (nnz != rows && rows != -1) {
            throw new DMLRuntimeException("Incorrect number of non-zeros: " + nnz + " != " + rows);
        }

        //execute seq instruction over offset input
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD
                .mapToPair(new GenerateSequenceBlock(rowsInBlock, seq_from, seq_to, seq_incr));

        //output handling
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        if (!mcOut.dimsKnown()) {
            mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz);
        }
        sec.setRDDHandleForVariable(output.getName(), out);
    } else if (this.method == DataGenMethod.SAMPLE) {
        generateSample(sec);
    }
}

From source file:org.apache.sysml.runtime.instructions.spark.RandSPInstruction.java

private void generateRandData(SparkExecutionContext sec) throws DMLRuntimeException {
    //step 1: generate pseudo-random seed (because not specified) 
    long lSeed = seed; //seed per invocation
    if (lSeed == DataGenOp.UNSPECIFIED_SEED)
        lSeed = DataGenOp.generateRandomSeed();

    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");

    //step 2: potential in-memory rand operations if applicable
    if (isMemAvail(rows, cols, sparsity, minValue, maxValue)
            && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
        RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows, (int) cols,
                rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
        MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);

        sec.setMatrixOutput(output.getName(), mb);
        Statistics.decrementNoOfExecutedSPInst();
        return;
    }

    //step 3: seed generation 
    JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null;
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
    long[] nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(rows, cols, rowsInBlock, colsInBlock,
            rows * cols * sparsity); //overestimate for on disk, ensures hdfs block per partition
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long numBlocks = nnz.length;
    long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock);

    //a) in-memory seed rdd construction 
    if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
        ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>();
        for (long i = 0; i < numBlocks; i++) {
            long r = 1 + i / numColBlocks;
            long c = 1 + i % numColBlocks;
            MatrixIndexes indx = new MatrixIndexes(r, c);
            Long seedForBlock = bigrand.nextLong();
            seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx,
                    new Tuple2<Long, Long>(seedForBlock, nnz[(int) i])));
        }

        //for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);

        //create seeds rdd 
        seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
    }
    //b) file-based seed rdd construction (for robustness wrt large number of blocks)
    else {
        String path = LibMatrixDatagen.generateUniqueSeedPath(dir);

        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            FSDataOutputStream fsOut = fs.create(new Path(path));
            PrintWriter pw = new PrintWriter(fsOut);
            StringBuilder sb = new StringBuilder();
            for (long i = 0; i < numBlocks; i++) {
                sb.append(1 + i / numColBlocks);
                sb.append(',');
                sb.append(1 + i % numColBlocks);
                sb.append(',');
                sb.append(bigrand.nextLong());
                sb.append(',');
                sb.append(nnz[(int) i]);
                pw.println(sb.toString());
                sb.setLength(0);
            }
            pw.close();
            fsOut.close();
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }

        //for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);

        //create seeds rdd 
        seedsRDD = sec.getSparkContext().textFile(path, numPartitions).mapToPair(new ExtractSeedTuple());
    }

    //step 4: execute rand instruction over seed input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols,
            rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));

    //step 5: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown(true)) {
        //note: we cannot compute the nnz from sparsity because this would not reflect the
        //actual number of non-zeros, except for the extreme cases sparsity == 0 or sparsity == 1.
        long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1;
        mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz);
    }
    sec.setRDDHandleForVariable(output.getName(), out);
}

From source file:org.apache.sysml.runtime.matrix.data.LibMatrixDatagen.java

private static long[] generateSeedsForCP(Well1024a bigrand, int nrb, int ncb) {
    int numBlocks = nrb * ncb;
    long[] seeds = new long[numBlocks];
    for (int l = 0; l < numBlocks; l++)
        seeds[l] = bigrand.nextLong();

    return seeds;
}

From source file:org.apache.sysml.runtime.matrix.DataGenMR.java

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);

            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            //seed generation
            Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
            long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                    randInst.getSparsity());
            int nnzIx = 0;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                for (long c = 0; c < clens[i]; c += bclens[i]) {
                    long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                    sb.append((r / brlens[i]) + 1);
                    sb.append(',');
                    sb.append((c / bclens[i]) + 1);
                    sb.append(',');
                    sb.append(curBlockRowSize);
                    sb.append(',');
                    sb.append(curBlockColSize);
                    sb.append(',');
                    sb.append(nnz[nnzIx++]);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            }
            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            StringBuilder sb = new StringBuilder();

            double temp = from;
            double block_from, block_to;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                long bid_i = ((r / brlens[i]) + 1);
                long bid_j = 1;
                block_from = temp;
                block_to = temp + (curBlockRowSize - 1) * incr;
                temp = block_to + incr; // next block starts from here

                sb.append(bid_i);
                sb.append(',');
                sb.append(bid_j);
                sb.append(',');
                /*
                // Need not include block size while generating seq()
                sb.append(curBlockRowSize);
                sb.append(',');
                sb.append(1);
                sb.append(',');*/
                sb.append(block_from);
                sb.append(',');
                sb.append(block_to);
                sb.append(',');
                sb.append(incr);

                pw.println(sb.toString());
                //System.out.println("MapTask " + r + ": " + sb.toString());
                sb.setLength(0);
                numblocks++;
            }

            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1);//remove the leading instruction delimiter
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correct the max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math
                .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}