Example usage for org.apache.mahout.math DenseMatrix DenseMatrix

Introduction

In this page you can find the example usage for org.apache.mahout.math DenseMatrix DenseMatrix.

Prototype

public DenseMatrix(double[][] values)

Source Link

Document

Construct a matrix from the given values

Usage

From source file:com.cloudera.science.ml.parallel.covariance.MahalanobisDistance.java

License:Open Source License

public void initialize() {
    if (m == null) {
        this.m = new DenseVector(means);
        this.ci = new DenseMatrix(covInv);
    }/*from   w w  w .  j ava 2 s . co m*/
}

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * Convert a 2-dimensional array to a dense matrix in {@link MapDir} format
 * @param vectors a 2-dimensional array of doubles
 * @param outPath the path to which the dense matrix will be written
 * @param tmpPath an argument required to be passed to {@link DistributedRowMatrix}
 * @param label a unique label to name the output matrix directory
 * @return a {@link DistributedRowMatrix} pointing to the in-filesystem matrix
 * @throws Exception/*from   w  ww .j  ava2  s .  c  om*/
 */
public static DistributedRowMatrix toDenseMapDir(double[][] vectors, Path outPath, Path tmpPath, String label)
        throws Exception {
    DenseMatrix m = new DenseMatrix(vectors);
    return AlgebraCommon.toMapDir(m, outPath, tmpPath, label);
}

From source file:org.carrot2.matrix.factorization.PartialSingularValueDecomposition.java

License:Open Source License

public void compute() {
    // Use Colt's SVD
    SingularValueDecomposition svd;/*  www .  ja  va2 s  . com*/
    if (A.columns() > A.rows()) {
        svd = new SingularValueDecomposition(new DenseMatrix(A.viewDice().toArray()));
        V = toColtMatrix(svd.getU());
        U = toColtMatrix(svd.getV());
    } else {
        svd = new SingularValueDecomposition(new DenseMatrix(A.toArray()));
        U = toColtMatrix(svd.getU());
        V = toColtMatrix(svd.getV());
    }

    S = svd.getSingularValues();

    if (k > 0 && k < S.length) {
        U = U.viewPart(0, 0, U.rows(), k);
        V = V.viewPart(0, 0, V.rows(), k);
        S = Arrays.copyOf(S, k);
    }
}

From source file:org.qcri.pca.CompositeJobTest.java

License:Apache License

@Before
public void setup() throws Exception {
    conf = new Configuration();
    long currTime = System.currentTimeMillis();
    Path outputDir = new Path("/tmp/" + currTime);
    FileSystem fs;/*from  w w w. j a v a 2s . c om*/
    try {
        fs = FileSystem.get(outputDir.toUri(), conf);
        fs.mkdirs(outputDir);
        fs.deleteOnExit(outputDir);
    } catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Error in creating output direcoty " + outputDir);
        return;
    }
    ym = computeMean(inputVectors);
    double[] xm = new double[xsize];
    times(ym, inMemMatrix, xm);
    ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
    xmPath = PCACommon.toDistributedVector(new DenseVector(xm), outputDir, "xm", conf);
    DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(inMemMatrix), outputDir,
            outputDir, "inMemMatrix");
    inMemMatrixPath = distMatrix.getRowPath();
    for (double[] row : xtx)
        for (int c = 0; c < row.length; c++)
            row[c] = 0;
    for (double[] row : ytx)
        for (int c = 0; c < row.length; c++)
            row[c] = 0;
    computeXtXandYtX(inputVectors);
}

From source file:org.qcri.pca.PCACommon.java

/** A hard-coded initialization matrix used for validation
  * //from   w ww  .  java2 s . co m
  * @param rows
  * @param cols
  * @return
*/

static Matrix randomValidationMatrix(int rows, int cols) {
    double randomArray[][] = { { 0.730967787376657, 0.24053641567148587, 0.6374174253501083 },
            { 0.5504370051176339, 0.5975452777972018, 0.3332183994766498 },
            { 0.3851891847407185, 0.984841540199809, 0.8791825178724801 },
            { 0.9412491794821144, 0.27495396603548483, 0.12889715087377673 },
            { 0.14660165764651822, 0.023238122483889456, 0.5467397571984656 } };
    DenseMatrix matrix = new DenseMatrix(randomArray);
    return matrix;
}

From source file:org.qcri.pca.ReconstructionErrJobTest.java

License:Apache License

@Before
public void setup() throws Exception {
    conf = new Configuration();
    long currTime = System.currentTimeMillis();
    Path outputDir = new Path("/tmp/" + currTime);
    FileSystem fs;// ww  w .  jav a 2  s .  c o m
    try {
        fs = FileSystem.get(outputDir.toUri(), conf);
        fs.mkdirs(outputDir);
        fs.deleteOnExit(outputDir);
    } catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Error in creating output direcoty " + outputDir);
        return;
    }
    ym = computeMean(inputVectors);
    double[] xm = new double[xsize];
    times(ym, y2xVectors, xm);
    double[] zm = new double[cols];
    timesTranspose(xm, cVectors, zm);
    for (int c = 0; c < cols; c++)
        zm[c] -= ym[c];
    ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
    zmPath = PCACommon.toDistributedVector(new DenseVector(zm), outputDir, "zm", conf);
    DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(y2xVectors), outputDir,
            outputDir, "y2xMatrix");
    y2xMatrixPath = distMatrix.getRowPath();
    distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
    cMatrixPath = distMatrix.getRowPath();
    computeError(inputVectors);
}

From source file:org.qcri.pca.VarianceJobTest.java

License:Apache License

@Before
public void setup() throws Exception {
    conf = new Configuration();
    long currTime = System.currentTimeMillis();
    Path outputDir = new Path("/tmp/" + currTime);
    FileSystem fs;/* w  w w  .  jav a2 s. c o  m*/
    try {
        fs = FileSystem.get(outputDir.toUri(), conf);
        fs.mkdirs(outputDir);
        fs.deleteOnExit(outputDir);
    } catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Error in creating output direcoty " + outputDir);
        return;
    }
    ym = computeMean(inputVectors);
    double[] xm = new double[xsize];
    times(ym, y2xVectors, xm);
    ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
    xmPath = PCACommon.toDistributedVector(new DenseVector(xm), outputDir, "xm", conf);
    DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(y2xVectors), outputDir,
            outputDir, "y2xMatrix");
    y2xMatrixPath = distMatrix.getRowPath();
    distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
    cMatrixPath = distMatrix.getRowPath();
}

From source file:org.qcri.sparkpca.SparkPCA.java

/**
 * Compute principal component analysis where the input is an RDD<org.apache.spark.mllib.linalg.Vector> of vectors such that each vector represents a row in the matrix
 * @param sc /*  w  ww.  j  a  v a 2 s.  c o  m*/
 *           Spark context that contains the configuration parameters and represents connection to the cluster 
 *          (used to create RDDs, accumulators and broadcast variables on that cluster)
 * @param vectors
 *          An RDD of vectors representing the rows of the input matrix
 * @param nRows
 *          Number of rows in input Matrix
 * @param nCols
 *          Number of columns in input Matrix
 * @param nPCs
 *          Number of desired principal components
 * @param errRate
 *          The sampling rate that is used for computing the reconstruction error
 * @param maxIterations 
 *          Maximum number of iterations before terminating
 * @return Matrix of size nCols X nPCs having the desired principal components
 */
public static org.apache.spark.mllib.linalg.Matrix computePrincipalComponents(JavaSparkContext sc,
        JavaRDD<org.apache.spark.mllib.linalg.Vector> vectors, String outputPath, final int nRows,
        final int nCols, final int nPCs, final double errRate, final int maxIterations,
        final int computeProjectedMatrix) {

    System.out.println("Rows: " + nRows + ", Columns " + nCols);
    double ss;
    Matrix centralC;

    // The two PPCA variables that improve over each iteration: Principal component matrix (C) and random variance (ss).
    // Initialized randomly for the first iteration
    ss = PCAUtils.randSS();
    centralC = PCAUtils.randomMatrix(nCols, nPCs);

    // 1. Mean Job : This job calculates the mean and span of the columns of the input RDD<org.apache.spark.mllib.linalg.Vector>
    final Accumulator<double[]> matrixAccumY = sc.accumulator(new double[nCols], new VectorAccumulatorParam());
    final double[] internalSumY = new double[nCols];
    vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {

        public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
            org.apache.spark.mllib.linalg.Vector yi;
            int[] indices = null;
            int i;
            while (arg0.hasNext()) {
                yi = arg0.next();
                indices = ((SparseVector) yi).indices();
                for (i = 0; i < indices.length; i++) {
                    internalSumY[indices[i]] += yi.apply(indices[i]);
                }
            }
            matrixAccumY.add(internalSumY);
        }

    });//End Mean Job

    //Get the sum of column Vector from the accumulator and divide each element by the number of rows to get the mean
    final Vector meanVector = new DenseVector(matrixAccumY.value()).divide(nRows);
    final Broadcast<Vector> br_ym_mahout = sc.broadcast(meanVector);

    //2. Frobenious Norm Job : Obtain Frobenius norm of the input RDD<org.apache.spark.mllib.linalg.Vector>
    /**
     * To compute the norm2 of a sparse matrix, iterate over sparse items and sum
     * square of the difference. After processing each row, add the sum of the
     * meanSquare of the zero-elements that were ignored in the sparse iteration.
    */
    double meanSquareSumTmp = 0;
    for (int i = 0; i < br_ym_mahout.value().size(); i++) {
        double element = br_ym_mahout.value().getQuick(i);
        meanSquareSumTmp += element * element;
    }
    final double meanSquareSum = meanSquareSumTmp;
    final Accumulator<Double> doubleAccumNorm2 = sc.accumulator(0.0);
    vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {

        public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
            double norm2 = 0;
            double meanSquareSumOfZeroElements = meanSquareSum;
            int[] indices = ((SparseVector) yi).indices();
            int i;
            int index;
            double v;
            //iterate over non
            for (i = 0; i < indices.length; i++) {
                index = indices[i];
                v = yi.apply(index);
                double mean = br_ym_mahout.value().getQuick(index);
                double diff = v - mean;
                diff *= diff;

                // cancel the effect of the non-zero element in meanSquareSum
                meanSquareSumOfZeroElements -= mean * mean;
                norm2 += diff;
            }
            // For all all zero items, the following has the sum of mean square
            norm2 += meanSquareSumOfZeroElements;
            doubleAccumNorm2.add(norm2);
        }

    });//end Frobenious Norm Job

    double norm2 = doubleAccumNorm2.value();
    log.info("NOOOOORM2=" + norm2);

    // initial CtC  
    Matrix centralCtC = centralC.transpose().times(centralC);

    Matrix centralY2X = null;
    // -------------------------- EM Iterations
    // while count
    int round = 0;
    double prevObjective = Double.MAX_VALUE;
    double error = 0;
    double relChangeInObjective = Double.MAX_VALUE;
    double prevError = Double.MAX_VALUE;
    final float threshold = 0.00001f;
    final int firstRound = round;

    for (; (round < maxIterations && relChangeInObjective > threshold && prevError > 0.02); round++) {

        // Sx = inv( ss * eye(d) + CtC );
        Matrix centralSx = centralCtC.clone();

        // Sx = inv( eye(d) + CtC/ss );
        centralSx.viewDiagonal().assign(Functions.plus(ss));
        centralSx = PCAUtils.inv(centralSx);

        // X = Y * C * Sx' => Y2X = C * Sx'
        centralY2X = centralC.times(centralSx.transpose());

        //Broadcast Y2X because it will be used in several jobs and several iterations.
        final Broadcast<Matrix> br_centralY2X = sc.broadcast(centralY2X);

        // Xm = Ym * Y2X
        Vector xm_mahout = new DenseVector(nPCs);
        xm_mahout = PCAUtils.denseVectorTimesMatrix(br_ym_mahout.value(), centralY2X, xm_mahout);

        //Broadcast Xm because it will be used in several iterations.
        final Broadcast<Vector> br_xm_mahout = sc.broadcast(xm_mahout);
        // We skip computing X as we generate it on demand using Y and Y2X

        // 3. X'X and Y'X Job:  The job computes the two matrices X'X and Y'X
        /**
         * Xc = Yc * MEM   (MEM is the in-memory broadcasted matrix Y2X)
         * 
         * XtX = Xc' * Xc
         * 
         * YtX = Yc' * Xc
         * 
         * It also considers that Y is sparse and receives the mean vectors Ym and Xm
         * separately.
         * 
         * Yc = Y - Ym
         * 
         * Xc = X - Xm
         * 
         * Xc = (Y - Ym) * MEM = Y * MEM - Ym * MEM = X - Xm
         * 
         * XtX = (X - Xm)' * (X - Xm)
         * 
         * YtX = (Y - Ym)' * (X - Xm)
         * 
        */
        final Accumulator<double[][]> matrixAccumXtx = sc.accumulator(new double[nPCs][nPCs],
                new MatrixAccumulatorParam());
        final Accumulator<double[][]> matrixAccumYtx = sc.accumulator(new double[nCols][nPCs],
                new MatrixAccumulatorParam());
        final Accumulator<double[]> matrixAccumX = sc.accumulator(new double[nPCs],
                new VectorAccumulatorParam());

        /*
         * Initialize the output matrices and vectors once in order to avoid generating massive intermediate data in the workers
         */
        final double[][] resArrayYtX = new double[nCols][nPCs];
        final double[][] resArrayXtX = new double[nPCs][nPCs];
        final double[] resArrayX = new double[nPCs];

        /*
         * Used to sum the vectors in one partition.
        */
        final double[][] internalSumYtX = new double[nCols][nPCs];
        final double[][] internalSumXtX = new double[nPCs][nPCs];
        final double[] internalSumX = new double[nPCs];

        vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {

            public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
                org.apache.spark.mllib.linalg.Vector yi;
                while (arg0.hasNext()) {
                    yi = arg0.next();

                    /*
                      * Perform in-memory matrix multiplication xi = yi' * Y2X
                     */
                    PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);

                    //get only the sparse indices
                    int[] indices = ((SparseVector) yi).indices();

                    PCAUtils.outerProductWithIndices(yi, br_ym_mahout.value(), resArrayX, br_xm_mahout.value(),
                            resArrayYtX, indices);
                    PCAUtils.outerProductArrayInput(resArrayX, br_xm_mahout.value(), resArrayX,
                            br_xm_mahout.value(), resArrayXtX);
                    int i, j, rowIndexYtX;

                    //add the sparse indices only
                    for (i = 0; i < indices.length; i++) {
                        rowIndexYtX = indices[i];
                        for (j = 0; j < nPCs; j++) {
                            internalSumYtX[rowIndexYtX][j] += resArrayYtX[rowIndexYtX][j];
                            resArrayYtX[rowIndexYtX][j] = 0; //reset it
                        }

                    }
                    for (i = 0; i < nPCs; i++) {
                        internalSumX[i] += resArrayX[i];
                        for (j = 0; j < nPCs; j++) {
                            internalSumXtX[i][j] += resArrayXtX[i][j];
                            resArrayXtX[i][j] = 0; //reset it
                        }

                    }
                }
                matrixAccumX.add(internalSumX);
                matrixAccumXtx.add(internalSumXtX);
                matrixAccumYtx.add(internalSumYtX);
            }

        });//end  X'X and Y'X Job

        /*
         * Get the values of the accumulators.
        */
        Matrix centralYtX = new DenseMatrix(matrixAccumYtx.value());
        Matrix centralXtX = new DenseMatrix(matrixAccumXtx.value());
        Vector centralSumX = new DenseVector(matrixAccumX.value());

        /*
          * Mi = (Yi-Ym)' x (Xi-Xm) = Yi' x (Xi-Xm) - Ym' x (Xi-Xm)
          * 
          * M = Sum(Mi) = Sum(Yi' x (Xi-Xm)) - Ym' x (Sum(Xi)-N*Xm)
          * 
          * The first part is done in the previous job and the second in the following method
         */
        centralYtX = PCAUtils.updateXtXAndYtx(centralYtX, centralSumX, br_ym_mahout.value(), xm_mahout, nRows);
        centralXtX = PCAUtils.updateXtXAndYtx(centralXtX, centralSumX, xm_mahout, xm_mahout, nRows);

        // XtX = X'*X + ss * Sx
        final double finalss = ss;
        centralXtX.assign(centralSx, new DoubleDoubleFunction() {
            @Override
            public double apply(double arg1, double arg2) {
                return arg1 + finalss * arg2;
            }
        });

        // C = (Ye'*X) / SumXtX;
        Matrix invXtX_central = PCAUtils.inv(centralXtX);
        centralC = centralYtX.times(invXtX_central);
        centralCtC = centralC.transpose().times(centralC);

        // Compute new value for ss
        // ss = ( sum(sum(Ye.^2)) + trace(XtX*CtC) - 2sum(XiCtYit))/(N*D);

        //4. Variance Job: Computes part of variance that requires a distributed job
        /**
         * xcty = Sum (xi * C' * yi')
         * 
         * We also regenerate xi on demand by the following formula:
         * 
         * xi = yi * y2x
         * 
         * To make it efficient for sparse matrices that are not mean-centered, we receive the mean
         * separately:
         * 
         * xi = (yi - ym) * y2x = yi * y2x - xm, where xm = ym*y2x
         * 
         * xi * C' * (yi-ym)' = xi * ((yi-ym)*C)' = xi * (yi*C - ym*C)'
         * 
         */

        double ss2 = PCAUtils.trace(centralXtX.times(centralCtC));
        final double[] resArrayYmC = new double[centralC.numCols()];
        PCAUtils.denseVectorTimesMatrix(meanVector, centralC, resArrayYmC);
        final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
        final double[] resArrayYiC = new double[centralC.numCols()];
        final Accumulator<Double> doubleAccumXctyt = sc.accumulator(0.0);
        vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
            public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
                PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
                PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value(), resArrayYiC);

                PCAUtils.subtract(resArrayYiC, resArrayYmC);
                double dotRes = PCAUtils.dot(resArrayX, resArrayYiC);
                doubleAccumXctyt.add(dotRes);
            }
        }); //end Variance Job

        double xctyt = doubleAccumXctyt.value();
        ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);

        double objective = ss;
        relChangeInObjective = Math.abs(1 - objective / prevObjective);
        prevObjective = objective;
        log.info("Objective:  %.6f    relative change: %.6f \n", objective, relChangeInObjective);

        if (!CALCULATE_ERR_ATTHEEND) {
            log.info("Computing the error at round " + round + " ...");
            System.out.println("Computing the error at round " + round + " ...");

            double[] resArrayZmTmp = new double[centralC.numRows()];
            PCAUtils.vectorTimesMatrixTranspose(xm_mahout, centralC, resArrayZmTmp);
            final double[] resArrayZm = PCAUtils.subtractVectorArray(resArrayZmTmp, meanVector);

            //5. Reconstruction Error Job: The job computes the reconstruction error of the input matrix after updating the principal 
            //components matrix
            /**
             * Xc = Yc * Y2X
             * 
             * ReconY = Xc * C'
             * 
             * Err = ReconY - Yc
             * 
             * Norm2(Err) = abs(Err).zSum().max()
             * 
             * To take the sparse matrixes into account we receive the mean separately:
             * 
             * X = (Y - Ym) * Y2X = X - Xm, where X=Y*Y2X and Xm=Ym*Y2X
             * 
             * ReconY = (X - Xm) * C' = X*C' - Xm*C'
             * 
             * Err = X*C' - Xm*C' - (Y - Ym) = X*C' - Y - Zm, where where Zm=Xm*C'-Ym
             * 
             */

            final Accumulator<double[]> vectorAccumErr = sc.accumulator(new double[nCols],
                    new VectorAccumulatorAbsParam());
            final Accumulator<double[]> vectorAccumNormCentralized = sc.accumulator(new double[nCols],
                    new VectorAccumulatorAbsParam());
            final double[] xiCtArray = new double[centralC.numRows()];
            final double[] normalizedArray = new double[nCols];

            vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
                public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
                    if (PCAUtils.pass(errRate))
                        return;

                    PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
                    PCAUtils.vectorTimesMatrixTranspose(resArrayX, br_centralC.value(), xiCtArray);

                    PCAUtils.denseVectorSubtractSparseSubtractDense(xiCtArray, yi, resArrayZm);
                    vectorAccumErr.add(xiCtArray);

                    PCAUtils.denseVectorMinusVector(yi, br_ym_mahout.value(), normalizedArray);
                    vectorAccumNormCentralized.add(normalizedArray);
                }
            }); // end Reconstruction Error Job

            /*
             * The norm of the reconstruction error matrix
             */
            double reconstructionError = PCAUtils.getMax(vectorAccumErr.value());
            log.info("************************ReconsructionError=" + reconstructionError);

            /*
             * The norm of the input matrix after centralization
            */
            double centralizedYNorm = PCAUtils.getMax(vectorAccumNormCentralized.value());
            log.info("************************CentralizedNOrm=" + centralizedYNorm);

            error = reconstructionError / centralizedYNorm;
            log.info("... end of computing the error at round " + round + " And error=" + error);

            prevError = error;
        }
    }
    if (computeProjectedMatrix == 1) {
        //7. Compute Projected Matrix Job: The job multiplies the input matrix Y with the principal components C to get the Projected vectors 
        final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
        JavaRDD<org.apache.spark.mllib.linalg.Vector> projectedVectors = vectors.map(
                new Function<org.apache.spark.mllib.linalg.Vector, org.apache.spark.mllib.linalg.Vector>() {

                    public org.apache.spark.mllib.linalg.Vector call(org.apache.spark.mllib.linalg.Vector yi)
                            throws Exception {

                        return PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value());
                    }
                });//End Compute Projected Matrix
        String path = outputPath + File.separator + "ProjectedMatrix";
        projectedVectors.saveAsTextFile(path);
    }
    return PCAUtils.convertMahoutToSparkMatrix(centralC);

}