Example usage for org.apache.mahout.math DenseMatrix DenseMatrix

List of usage examples for org.apache.mahout.math DenseMatrix DenseMatrix

Introduction

In this page you can find the example usage for org.apache.mahout.math DenseMatrix DenseMatrix.

Prototype

public DenseMatrix(double[][] values) 

Source Link

Document

Construct a matrix from the given values

Usage

From source file:com.cloudera.science.ml.parallel.covariance.MahalanobisDistance.java

License:Open Source License

public void initialize() {
    if (m == null) {
        this.m = new DenseVector(means);
        this.ci = new DenseMatrix(covInv);
    }/*from   w w  w .  j ava 2 s . co m*/
}

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * Convert a 2-dimensional array to a dense matrix in {@link MapDir} format
 * @param vectors a 2-dimensional array of doubles
 * @param outPath the path to which the dense matrix will be written
 * @param tmpPath an argument required to be passed to {@link DistributedRowMatrix}
 * @param label a unique label to name the output matrix directory
 * @return a {@link DistributedRowMatrix} pointing to the in-filesystem matrix
 * @throws Exception/*from   w  ww .j  ava2  s .  c  om*/
 */
public static DistributedRowMatrix toDenseMapDir(double[][] vectors, Path outPath, Path tmpPath, String label)
        throws Exception {
    DenseMatrix m = new DenseMatrix(vectors);
    return AlgebraCommon.toMapDir(m, outPath, tmpPath, label);
}

From source file:org.carrot2.matrix.factorization.PartialSingularValueDecomposition.java

License:Open Source License

public void compute() {
    // Use Colt's SVD
    SingularValueDecomposition svd;/*  www .  ja  va2 s  . com*/
    if (A.columns() > A.rows()) {
        svd = new SingularValueDecomposition(new DenseMatrix(A.viewDice().toArray()));
        V = toColtMatrix(svd.getU());
        U = toColtMatrix(svd.getV());
    } else {
        svd = new SingularValueDecomposition(new DenseMatrix(A.toArray()));
        U = toColtMatrix(svd.getU());
        V = toColtMatrix(svd.getV());
    }

    S = svd.getSingularValues();

    if (k > 0 && k < S.length) {
        U = U.viewPart(0, 0, U.rows(), k);
        V = V.viewPart(0, 0, V.rows(), k);
        S = Arrays.copyOf(S, k);
    }
}

From source file:org.qcri.pca.CompositeJobTest.java

License:Apache License

@Before
public void setup() throws Exception {
    conf = new Configuration();
    long currTime = System.currentTimeMillis();
    Path outputDir = new Path("/tmp/" + currTime);
    FileSystem fs;/*from  w w w. j a v a 2s . c om*/
    try {
        fs = FileSystem.get(outputDir.toUri(), conf);
        fs.mkdirs(outputDir);
        fs.deleteOnExit(outputDir);
    } catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Error in creating output direcoty " + outputDir);
        return;
    }
    ym = computeMean(inputVectors);
    double[] xm = new double[xsize];
    times(ym, inMemMatrix, xm);
    ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
    xmPath = PCACommon.toDistributedVector(new DenseVector(xm), outputDir, "xm", conf);
    DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(inMemMatrix), outputDir,
            outputDir, "inMemMatrix");
    inMemMatrixPath = distMatrix.getRowPath();
    for (double[] row : xtx)
        for (int c = 0; c < row.length; c++)
            row[c] = 0;
    for (double[] row : ytx)
        for (int c = 0; c < row.length; c++)
            row[c] = 0;
    computeXtXandYtX(inputVectors);
}

From source file:org.qcri.pca.PCACommon.java

/** A hard-coded initialization matrix used for validation
  * //from   w ww  .  java2 s . co m
  * @param rows
  * @param cols
  * @return
*/

static Matrix randomValidationMatrix(int rows, int cols) {
    double randomArray[][] = { { 0.730967787376657, 0.24053641567148587, 0.6374174253501083 },
            { 0.5504370051176339, 0.5975452777972018, 0.3332183994766498 },
            { 0.3851891847407185, 0.984841540199809, 0.8791825178724801 },
            { 0.9412491794821144, 0.27495396603548483, 0.12889715087377673 },
            { 0.14660165764651822, 0.023238122483889456, 0.5467397571984656 } };
    DenseMatrix matrix = new DenseMatrix(randomArray);
    return matrix;
}

From source file:org.qcri.pca.ReconstructionErrJobTest.java

License:Apache License

@Before
public void setup() throws Exception {
    conf = new Configuration();
    long currTime = System.currentTimeMillis();
    Path outputDir = new Path("/tmp/" + currTime);
    FileSystem fs;// ww  w .  jav a 2  s .  c o m
    try {
        fs = FileSystem.get(outputDir.toUri(), conf);
        fs.mkdirs(outputDir);
        fs.deleteOnExit(outputDir);
    } catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Error in creating output direcoty " + outputDir);
        return;
    }
    ym = computeMean(inputVectors);
    double[] xm = new double[xsize];
    times(ym, y2xVectors, xm);
    double[] zm = new double[cols];
    timesTranspose(xm, cVectors, zm);
    for (int c = 0; c < cols; c++)
        zm[c] -= ym[c];
    ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
    zmPath = PCACommon.toDistributedVector(new DenseVector(zm), outputDir, "zm", conf);
    DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(y2xVectors), outputDir,
            outputDir, "y2xMatrix");
    y2xMatrixPath = distMatrix.getRowPath();
    distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
    cMatrixPath = distMatrix.getRowPath();
    computeError(inputVectors);
}

From source file:org.qcri.pca.VarianceJobTest.java

License:Apache License

@Before
public void setup() throws Exception {
    conf = new Configuration();
    long currTime = System.currentTimeMillis();
    Path outputDir = new Path("/tmp/" + currTime);
    FileSystem fs;/* w  w w  .  jav a2 s. c o  m*/
    try {
        fs = FileSystem.get(outputDir.toUri(), conf);
        fs.mkdirs(outputDir);
        fs.deleteOnExit(outputDir);
    } catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Error in creating output direcoty " + outputDir);
        return;
    }
    ym = computeMean(inputVectors);
    double[] xm = new double[xsize];
    times(ym, y2xVectors, xm);
    ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
    xmPath = PCACommon.toDistributedVector(new DenseVector(xm), outputDir, "xm", conf);
    DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(y2xVectors), outputDir,
            outputDir, "y2xMatrix");
    y2xMatrixPath = distMatrix.getRowPath();
    distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
    cMatrixPath = distMatrix.getRowPath();
}

From source file:org.qcri.sparkpca.SparkPCA.java

/**
 * Compute principal component analysis where the input is an RDD<org.apache.spark.mllib.linalg.Vector> of vectors such that each vector represents a row in the matrix
 * @param sc /*  w  ww.  j  a  v a 2 s.  c o  m*/
 *           Spark context that contains the configuration parameters and represents connection to the cluster 
 *          (used to create RDDs, accumulators and broadcast variables on that cluster)
 * @param vectors
 *          An RDD of vectors representing the rows of the input matrix
 * @param nRows
 *          Number of rows in input Matrix
 * @param nCols
 *          Number of columns in input Matrix
 * @param nPCs
 *          Number of desired principal components
 * @param errRate
 *          The sampling rate that is used for computing the reconstruction error
 * @param maxIterations 
 *          Maximum number of iterations before terminating
 * @return Matrix of size nCols X nPCs having the desired principal components
 */
public static org.apache.spark.mllib.linalg.Matrix computePrincipalComponents(JavaSparkContext sc,
        JavaRDD<org.apache.spark.mllib.linalg.Vector> vectors, String outputPath, final int nRows,
        final int nCols, final int nPCs, final double errRate, final int maxIterations,
        final int computeProjectedMatrix) {

    System.out.println("Rows: " + nRows + ", Columns " + nCols);
    double ss;
    Matrix centralC;

    // The two PPCA variables that improve over each iteration: Principal component matrix (C) and random variance (ss).
    // Initialized randomly for the first iteration
    ss = PCAUtils.randSS();
    centralC = PCAUtils.randomMatrix(nCols, nPCs);

    // 1. Mean Job : This job calculates the mean and span of the columns of the input RDD<org.apache.spark.mllib.linalg.Vector>
    final Accumulator<double[]> matrixAccumY = sc.accumulator(new double[nCols], new VectorAccumulatorParam());
    final double[] internalSumY = new double[nCols];
    vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {

        public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
            org.apache.spark.mllib.linalg.Vector yi;
            int[] indices = null;
            int i;
            while (arg0.hasNext()) {
                yi = arg0.next();
                indices = ((SparseVector) yi).indices();
                for (i = 0; i < indices.length; i++) {
                    internalSumY[indices[i]] += yi.apply(indices[i]);
                }
            }
            matrixAccumY.add(internalSumY);
        }

    });//End Mean Job

    //Get the sum of column Vector from the accumulator and divide each element by the number of rows to get the mean
    final Vector meanVector = new DenseVector(matrixAccumY.value()).divide(nRows);
    final Broadcast<Vector> br_ym_mahout = sc.broadcast(meanVector);

    //2. Frobenious Norm Job : Obtain Frobenius norm of the input RDD<org.apache.spark.mllib.linalg.Vector>
    /**
     * To compute the norm2 of a sparse matrix, iterate over sparse items and sum
     * square of the difference. After processing each row, add the sum of the
     * meanSquare of the zero-elements that were ignored in the sparse iteration.
    */
    double meanSquareSumTmp = 0;
    for (int i = 0; i < br_ym_mahout.value().size(); i++) {
        double element = br_ym_mahout.value().getQuick(i);
        meanSquareSumTmp += element * element;
    }
    final double meanSquareSum = meanSquareSumTmp;
    final Accumulator<Double> doubleAccumNorm2 = sc.accumulator(0.0);
    vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {

        public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
            double norm2 = 0;
            double meanSquareSumOfZeroElements = meanSquareSum;
            int[] indices = ((SparseVector) yi).indices();
            int i;
            int index;
            double v;
            //iterate over non
            for (i = 0; i < indices.length; i++) {
                index = indices[i];
                v = yi.apply(index);
                double mean = br_ym_mahout.value().getQuick(index);
                double diff = v - mean;
                diff *= diff;

                // cancel the effect of the non-zero element in meanSquareSum
                meanSquareSumOfZeroElements -= mean * mean;
                norm2 += diff;
            }
            // For all all zero items, the following has the sum of mean square
            norm2 += meanSquareSumOfZeroElements;
            doubleAccumNorm2.add(norm2);
        }

    });//end Frobenious Norm Job

    double norm2 = doubleAccumNorm2.value();
    log.info("NOOOOORM2=" + norm2);

    // initial CtC  
    Matrix centralCtC = centralC.transpose().times(centralC);

    Matrix centralY2X = null;
    // -------------------------- EM Iterations
    // while count
    int round = 0;
    double prevObjective = Double.MAX_VALUE;
    double error = 0;
    double relChangeInObjective = Double.MAX_VALUE;
    double prevError = Double.MAX_VALUE;
    final float threshold = 0.00001f;
    final int firstRound = round;

    for (; (round < maxIterations && relChangeInObjective > threshold && prevError > 0.02); round++) {

        // Sx = inv( ss * eye(d) + CtC );
        Matrix centralSx = centralCtC.clone();

        // Sx = inv( eye(d) + CtC/ss );
        centralSx.viewDiagonal().assign(Functions.plus(ss));
        centralSx = PCAUtils.inv(centralSx);

        // X = Y * C * Sx' => Y2X = C * Sx'
        centralY2X = centralC.times(centralSx.transpose());

        //Broadcast Y2X because it will be used in several jobs and several iterations.
        final Broadcast<Matrix> br_centralY2X = sc.broadcast(centralY2X);

        // Xm = Ym * Y2X
        Vector xm_mahout = new DenseVector(nPCs);
        xm_mahout = PCAUtils.denseVectorTimesMatrix(br_ym_mahout.value(), centralY2X, xm_mahout);

        //Broadcast Xm because it will be used in several iterations.
        final Broadcast<Vector> br_xm_mahout = sc.broadcast(xm_mahout);
        // We skip computing X as we generate it on demand using Y and Y2X

        // 3. X'X and Y'X Job:  The job computes the two matrices X'X and Y'X
        /**
         * Xc = Yc * MEM   (MEM is the in-memory broadcasted matrix Y2X)
         * 
         * XtX = Xc' * Xc
         * 
         * YtX = Yc' * Xc
         * 
         * It also considers that Y is sparse and receives the mean vectors Ym and Xm
         * separately.
         * 
         * Yc = Y - Ym
         * 
         * Xc = X - Xm
         * 
         * Xc = (Y - Ym) * MEM = Y * MEM - Ym * MEM = X - Xm
         * 
         * XtX = (X - Xm)' * (X - Xm)
         * 
         * YtX = (Y - Ym)' * (X - Xm)
         * 
        */
        final Accumulator<double[][]> matrixAccumXtx = sc.accumulator(new double[nPCs][nPCs],
                new MatrixAccumulatorParam());
        final Accumulator<double[][]> matrixAccumYtx = sc.accumulator(new double[nCols][nPCs],
                new MatrixAccumulatorParam());
        final Accumulator<double[]> matrixAccumX = sc.accumulator(new double[nPCs],
                new VectorAccumulatorParam());

        /*
         * Initialize the output matrices and vectors once in order to avoid generating massive intermediate data in the workers
         */
        final double[][] resArrayYtX = new double[nCols][nPCs];
        final double[][] resArrayXtX = new double[nPCs][nPCs];
        final double[] resArrayX = new double[nPCs];

        /*
         * Used to sum the vectors in one partition.
        */
        final double[][] internalSumYtX = new double[nCols][nPCs];
        final double[][] internalSumXtX = new double[nPCs][nPCs];
        final double[] internalSumX = new double[nPCs];

        vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {

            public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
                org.apache.spark.mllib.linalg.Vector yi;
                while (arg0.hasNext()) {
                    yi = arg0.next();

                    /*
                      * Perform in-memory matrix multiplication xi = yi' * Y2X
                     */
                    PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);

                    //get only the sparse indices
                    int[] indices = ((SparseVector) yi).indices();

                    PCAUtils.outerProductWithIndices(yi, br_ym_mahout.value(), resArrayX, br_xm_mahout.value(),
                            resArrayYtX, indices);
                    PCAUtils.outerProductArrayInput(resArrayX, br_xm_mahout.value(), resArrayX,
                            br_xm_mahout.value(), resArrayXtX);
                    int i, j, rowIndexYtX;

                    //add the sparse indices only
                    for (i = 0; i < indices.length; i++) {
                        rowIndexYtX = indices[i];
                        for (j = 0; j < nPCs; j++) {
                            internalSumYtX[rowIndexYtX][j] += resArrayYtX[rowIndexYtX][j];
                            resArrayYtX[rowIndexYtX][j] = 0; //reset it
                        }

                    }
                    for (i = 0; i < nPCs; i++) {
                        internalSumX[i] += resArrayX[i];
                        for (j = 0; j < nPCs; j++) {
                            internalSumXtX[i][j] += resArrayXtX[i][j];
                            resArrayXtX[i][j] = 0; //reset it
                        }

                    }
                }
                matrixAccumX.add(internalSumX);
                matrixAccumXtx.add(internalSumXtX);
                matrixAccumYtx.add(internalSumYtX);
            }

        });//end  X'X and Y'X Job

        /*
         * Get the values of the accumulators.
        */
        Matrix centralYtX = new DenseMatrix(matrixAccumYtx.value());
        Matrix centralXtX = new DenseMatrix(matrixAccumXtx.value());
        Vector centralSumX = new DenseVector(matrixAccumX.value());

        /*
          * Mi = (Yi-Ym)' x (Xi-Xm) = Yi' x (Xi-Xm) - Ym' x (Xi-Xm)
          * 
          * M = Sum(Mi) = Sum(Yi' x (Xi-Xm)) - Ym' x (Sum(Xi)-N*Xm)
          * 
          * The first part is done in the previous job and the second in the following method
         */
        centralYtX = PCAUtils.updateXtXAndYtx(centralYtX, centralSumX, br_ym_mahout.value(), xm_mahout, nRows);
        centralXtX = PCAUtils.updateXtXAndYtx(centralXtX, centralSumX, xm_mahout, xm_mahout, nRows);

        // XtX = X'*X + ss * Sx
        final double finalss = ss;
        centralXtX.assign(centralSx, new DoubleDoubleFunction() {
            @Override
            public double apply(double arg1, double arg2) {
                return arg1 + finalss * arg2;
            }
        });

        // C = (Ye'*X) / SumXtX;
        Matrix invXtX_central = PCAUtils.inv(centralXtX);
        centralC = centralYtX.times(invXtX_central);
        centralCtC = centralC.transpose().times(centralC);

        // Compute new value for ss
        // ss = ( sum(sum(Ye.^2)) + trace(XtX*CtC) - 2sum(XiCtYit))/(N*D);

        //4. Variance Job: Computes part of variance that requires a distributed job
        /**
         * xcty = Sum (xi * C' * yi')
         * 
         * We also regenerate xi on demand by the following formula:
         * 
         * xi = yi * y2x
         * 
         * To make it efficient for sparse matrices that are not mean-centered, we receive the mean
         * separately:
         * 
         * xi = (yi - ym) * y2x = yi * y2x - xm, where xm = ym*y2x
         * 
         * xi * C' * (yi-ym)' = xi * ((yi-ym)*C)' = xi * (yi*C - ym*C)'
         * 
         */

        double ss2 = PCAUtils.trace(centralXtX.times(centralCtC));
        final double[] resArrayYmC = new double[centralC.numCols()];
        PCAUtils.denseVectorTimesMatrix(meanVector, centralC, resArrayYmC);
        final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
        final double[] resArrayYiC = new double[centralC.numCols()];
        final Accumulator<Double> doubleAccumXctyt = sc.accumulator(0.0);
        vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
            public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
                PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
                PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value(), resArrayYiC);

                PCAUtils.subtract(resArrayYiC, resArrayYmC);
                double dotRes = PCAUtils.dot(resArrayX, resArrayYiC);
                doubleAccumXctyt.add(dotRes);
            }
        }); //end Variance Job

        double xctyt = doubleAccumXctyt.value();
        ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);

        double objective = ss;
        relChangeInObjective = Math.abs(1 - objective / prevObjective);
        prevObjective = objective;
        log.info("Objective:  %.6f    relative change: %.6f \n", objective, relChangeInObjective);

        if (!CALCULATE_ERR_ATTHEEND) {
            log.info("Computing the error at round " + round + " ...");
            System.out.println("Computing the error at round " + round + " ...");

            double[] resArrayZmTmp = new double[centralC.numRows()];
            PCAUtils.vectorTimesMatrixTranspose(xm_mahout, centralC, resArrayZmTmp);
            final double[] resArrayZm = PCAUtils.subtractVectorArray(resArrayZmTmp, meanVector);

            //5. Reconstruction Error Job: The job computes the reconstruction error of the input matrix after updating the principal 
            //components matrix
            /**
             * Xc = Yc * Y2X
             * 
             * ReconY = Xc * C'
             * 
             * Err = ReconY - Yc
             * 
             * Norm2(Err) = abs(Err).zSum().max()
             * 
             * To take the sparse matrixes into account we receive the mean separately:
             * 
             * X = (Y - Ym) * Y2X = X - Xm, where X=Y*Y2X and Xm=Ym*Y2X
             * 
             * ReconY = (X - Xm) * C' = X*C' - Xm*C'
             * 
             * Err = X*C' - Xm*C' - (Y - Ym) = X*C' - Y - Zm, where where Zm=Xm*C'-Ym
             * 
             */

            final Accumulator<double[]> vectorAccumErr = sc.accumulator(new double[nCols],
                    new VectorAccumulatorAbsParam());
            final Accumulator<double[]> vectorAccumNormCentralized = sc.accumulator(new double[nCols],
                    new VectorAccumulatorAbsParam());
            final double[] xiCtArray = new double[centralC.numRows()];
            final double[] normalizedArray = new double[nCols];

            vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
                public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
                    if (PCAUtils.pass(errRate))
                        return;

                    PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
                    PCAUtils.vectorTimesMatrixTranspose(resArrayX, br_centralC.value(), xiCtArray);

                    PCAUtils.denseVectorSubtractSparseSubtractDense(xiCtArray, yi, resArrayZm);
                    vectorAccumErr.add(xiCtArray);

                    PCAUtils.denseVectorMinusVector(yi, br_ym_mahout.value(), normalizedArray);
                    vectorAccumNormCentralized.add(normalizedArray);
                }
            }); // end Reconstruction Error Job

            /*
             * The norm of the reconstruction error matrix
             */
            double reconstructionError = PCAUtils.getMax(vectorAccumErr.value());
            log.info("************************ReconsructionError=" + reconstructionError);

            /*
             * The norm of the input matrix after centralization
            */
            double centralizedYNorm = PCAUtils.getMax(vectorAccumNormCentralized.value());
            log.info("************************CentralizedNOrm=" + centralizedYNorm);

            error = reconstructionError / centralizedYNorm;
            log.info("... end of computing the error at round " + round + " And error=" + error);

            prevError = error;
        }
    }
    if (computeProjectedMatrix == 1) {
        //7. Compute Projected Matrix Job: The job multiplies the input matrix Y with the principal components C to get the Projected vectors 
        final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
        JavaRDD<org.apache.spark.mllib.linalg.Vector> projectedVectors = vectors.map(
                new Function<org.apache.spark.mllib.linalg.Vector, org.apache.spark.mllib.linalg.Vector>() {

                    public org.apache.spark.mllib.linalg.Vector call(org.apache.spark.mllib.linalg.Vector yi)
                            throws Exception {

                        return PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value());
                    }
                });//End Compute Projected Matrix
        String path = outputPath + File.separator + "ProjectedMatrix";
        projectedVectors.saveAsTextFile(path);
    }
    return PCAUtils.convertMahoutToSparkMatrix(centralC);

}