Example usage for org.apache.mahout.math Matrix transpose

Introduction

On this page you can find example usage for org.apache.mahout.math Matrix transpose.

Prototype

Matrix transpose();

Source Link

Document

Return a new matrix that is the transpose of the receiver.
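
A minimal sketch of the call (assuming only mahout-math on the classpath; the class name and values are illustrative): transpose() copies the receiver into a new matrix with rows and columns swapped, leaving the receiver unchanged.

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class TransposeSketch {
    public static void main(String[] args) {
        Matrix a = new DenseMatrix(new double[][] { { 1, 2, 3 }, { 4, 5, 6 } }); // 2 x 3
        Matrix at = a.transpose(); // new 3 x 2 matrix; a is not modified
        System.out.println(at.numRows() + " x " + at.numCols()); // 3 x 2
        System.out.println(at.get(2, 0)); // 3.0, formerly a.get(0, 2)
    }
}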

Usage

From source file:ca.uwaterloo.cpami.mahout.matrix.utils.GramSchmidt.java

License:Apache License

public static void main(String[] args) throws IOException {

    //final Configuration conf = new Configuration();
    //final FileSystem fs = FileSystem.get(conf);
    //final SequenceFile.Reader reader = new SequenceFile.Reader(fs,
    //   new Path("R1.dat"), conf);
    //IntWritable key = new IntWritable();
    //VectorWritable vec = new VectorWritable();
    Matrix mat = new SparseMatrix(1500, 100);
    //SparseRealMatrix mat2 = new OpenMapRealMatrix(12419,1500 );
    BufferedReader reader = new BufferedReader(new FileReader("R.3.csv"));
    String line = null;
    while ((line = reader.readLine()) != null) {
        String[] parts = line.split(",");

        mat.set(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]));
        /*
        Vector v = vec.get();
        int i=0;
        Iterator<Vector.Element> itr = v.iterateNonZero();
        while(itr.hasNext()){
           double elem = itr.next().get();
           if(elem !=0)
              mat2.setEntry(i, key.get(), elem);
           i++;
        }
        */
    }

    //mat = mat.transpose();
    System.out.println(mat.viewColumn(0).isDense());
    System.out.println(mat.viewRow(0).isDense());
    mat = mat.transpose();
    GramSchmidt.orthonormalizeColumns(mat);
    /*
    System.out.println("started QR");
    System.out.println(Runtime.getRuntime().maxMemory());
    System.out.println(Runtime.getRuntime().maxMemory()-Runtime.getRuntime().freeMemory());
    QRDecomposition qr = new QRDecomposition(mat2);
    System.out.println(qr.getQ().getColumnDimension());
    System.out.println(qr.getQ().getRowDimension());
    */
    //mat = mat.transpose();
    //storeSparseColumns(mat);
    //for (int i = 0; i < 10; i++) {
    //   System.out.println(mat.viewRow(i).getNumNondefaultElements());
    //}

}
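
A smaller, self-contained sketch of the transpose step used above (illustrative values, not part of the original source): transpose() on a SparseMatrix returns a new matrix with the dimensions swapped and every entry mirrored, which is what lets GramSchmidt.orthonormalizeColumns operate on what were originally the rows.

import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.SparseMatrix;

public class SparseTransposeSketch {
    public static void main(String[] args) {
        Matrix mat = new SparseMatrix(1500, 100);
        mat.set(3, 7, 42.0);

        mat = mat.transpose(); // new 100 x 1500 matrix
        System.out.println(mat.numRows() + " x " + mat.numCols()); // 100 x 1500
        System.out.println(mat.get(7, 3)); // 42.0
    }
}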

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJobTest.java

License:Apache License

/**
 * small integration test that runs the full job
 *
 * <pre>
 *
 *  user-item-matrix
 *
 *          burger  hotdog  berries  icecream
 *  dog       5       5        2        -
 *  rabbit    2       -        3        5
 *  cow       -       5        -        3
 *  donkey    3       -        -        5
 *
 * </pre>
 */
@Test
public void completeJobToyExample() throws Exception {

    Double na = Double.NaN;
    Matrix preferences = new SparseRowMatrix(4, 4,
            new Vector[] { new DenseVector(new double[] { 5.0, 5.0, 2.0, na }),
                    new DenseVector(new double[] { 2.0, na, 3.0, 5.0 }),
                    new DenseVector(new double[] { na, 5.0, na, 3.0 }),
                    new DenseVector(new double[] { 3.0, na, na, 5.0 }) });

    writeLines(inputFile, preferencesAsText(preferences));
    indexSizeFile.deleteOnExit();
    writeLines(indexSizeFile, "0,4\n1,4");

    ParallelALSFactorizationJob alsFactorization = new ParallelALSFactorizationJob();
    alsFactorization.setConf(conf);

    int numFeatures = 3;
    int numIterations = 5;
    double lambda = 0.065;

    alsFactorization
            .run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
                    "--tempDir", tmpDir.getAbsolutePath(), "--lambda", String.valueOf(lambda), "--numFeatures",
                    String.valueOf(numFeatures), "--numIterations", String.valueOf(numIterations),
                    "--indexSizes", indexSizeFile.toString(), "--useTransform", "false" });
    Matrix u = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "U/part-m-00000"),
            preferences.numRows(), numFeatures);
    Matrix m = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "M/part-m-00000"),
            preferences.numCols(), numFeatures);

    StringBuilder info = new StringBuilder();
    info.append("\nA - users x items\n\n");
    info.append(MathHelper.nice(preferences));
    info.append("\nU - users x features\n\n");
    info.append(MathHelper.nice(u));
    info.append("\nM - items x features\n\n");
    info.append(MathHelper.nice(m));
    Matrix Ak = u.times(m.transpose());
    info.append("\nAk - users x items\n\n");
    info.append(MathHelper.nice(Ak));
    info.append('\n');

    log.info(info.toString());

    RunningAverage avg = new FullRunningAverage();
    Iterator<MatrixSlice> sliceIterator = preferences.iterateAll();
    while (sliceIterator.hasNext()) {
        MatrixSlice slice = sliceIterator.next();
        Iterator<Vector.Element> elementIterator = slice.vector().iterateNonZero();
        while (elementIterator.hasNext()) {
            Vector.Element e = elementIterator.next();
            if (!Double.isNaN(e.get())) {
                double pref = e.get();
                double estimate = u.viewRow(slice.index()).dot(m.viewRow(e.index()));
                double err = pref - estimate;
                avg.addDatum(err * err);
                log.info("Comparing preference of user [{}] towards item [{}], was [{}] estimate is [{}]",
                        new Object[] { slice.index(), e.index(), pref, estimate });
            }
        }
    }
    double rmse = Math.sqrt(avg.getAverage());
    log.info("RMSE: {}", rmse);

    assertTrue(rmse < 0.2);
}
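
The RMSE loop above compares each known preference with u.viewRow(i).dot(m.viewRow(j)); those dot products are exactly the entries of the reconstruction Ak = u.times(m.transpose()). A standalone sketch of that equivalence (toy values and an illustrative class name, not part of the test):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class ReconstructionSketch {
    public static void main(String[] args) {
        Matrix u = new DenseMatrix(new double[][] { { 1, 2 }, { 3, 4 } }); // users x features
        Matrix m = new DenseMatrix(new double[][] { { 5, 6 }, { 7, 8 } }); // items x features

        Matrix ak = u.times(m.transpose()); // users x items
        // entry (i, j) of Ak equals the dot product of user row i and item row j
        System.out.println(ak.get(0, 1));                   // 23.0
        System.out.println(u.viewRow(0).dot(m.viewRow(1))); // 23.0
    }
}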

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJobTest.java

License:Apache License

@Test
public void completeJobImplicitToyExample() throws Exception {

    Matrix observations = new SparseRowMatrix(4, 4,
            new Vector[] { new DenseVector(new double[] { 5.0, 5.0, 2.0, 0 }),
                    new DenseVector(new double[] { 2.0, 0, 3.0, 5.0 }),
                    new DenseVector(new double[] { 0, 5.0, 0, 3.0 }),
                    new DenseVector(new double[] { 3.0, 0, 0, 5.0 }) });

    Matrix preferences = new SparseRowMatrix(4, 4,
            new Vector[] { new DenseVector(new double[] { 1.0, 1.0, 1.0, 0 }),
                    new DenseVector(new double[] { 1.0, 0, 1.0, 1.0 }),
                    new DenseVector(new double[] { 0, 1.0, 0, 1.0 }),
                    new DenseVector(new double[] { 1.0, 0, 0, 1.0 }) });

    writeLines(inputFile, preferencesAsText(observations));
    writeLines(indexSizeFile, "0,4\n1,4");
    ParallelALSFactorizationJob alsFactorization = new ParallelALSFactorizationJob();
    alsFactorization.setConf(conf);

    int numFeatures = 3;
    int numIterations = 5;
    double lambda = 0.065;
    double alpha = 20;

    alsFactorization.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output",
            outputDir.getAbsolutePath(), "--tempDir", tmpDir.getAbsolutePath(), "--lambda",
            String.valueOf(lambda), "--implicitFeedback", String.valueOf(true), "--alpha",
            String.valueOf(alpha), "--numFeatures", String.valueOf(numFeatures), "--numIterations",
            String.valueOf(numIterations), "--indexSizes", indexSizeFile.toString(), "--useTransform",
            "false" });

    Matrix u = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "U/part-m-00000"),
            observations.numRows(), numFeatures);
    Matrix m = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "M/part-m-00000"),
            observations.numCols(), numFeatures);

    StringBuilder info = new StringBuilder();
    info.append("\nObservations - users x items\n");
    info.append(MathHelper.nice(observations));
    info.append("\nA - users x items\n\n");
    info.append(MathHelper.nice(preferences));
    info.append("\nU - users x features\n\n");
    info.append(MathHelper.nice(u));
    info.append("\nM - items x features\n\n");
    info.append(MathHelper.nice(m));
    Matrix Ak = u.times(m.transpose());
    info.append("\nAk - users x items\n\n");
    info.append(MathHelper.nice(Ak));
    info.append('\n');

    log.info(info.toString());

    RunningAverage avg = new FullRunningAverage();
    Iterator<MatrixSlice> sliceIterator = preferences.iterateAll();
    while (sliceIterator.hasNext()) {
        MatrixSlice slice = sliceIterator.next();
        for (Vector.Element e : slice.vector()) {
            if (!Double.isNaN(e.get())) {
                double pref = e.get();
                double estimate = u.viewRow(slice.index()).dot(m.viewRow(e.index()));
                double confidence = 1 + alpha * observations.getQuick(slice.index(), e.index());
                double err = confidence * (pref - estimate) * (pref - estimate);
                avg.addDatum(err);
                log.info(
                        "Comparing preference of user [{}] towards item [{}], was [{}] with confidence [{}] "
                                + "estimate is [{}]",
                        new Object[] { slice.index(), e.index(), pref, confidence, estimate });
            }
        }
    }
    double rmse = Math.sqrt(avg.getAverage());
    log.info("RMSE: {}", rmse);

    assertTrue(rmse < 0.4);
}

From source file:com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresReasonSolver.java

License:Apache License

private Matrix YtransposeY(OpenIntObjectHashMap<Vector> Y) {

    Matrix compactedY = new DenseMatrix(Y.size(), numFeatures);
    IntArrayList indexes = Y.keys();
    indexes.quickSort();

    int row = 0;
    for (int index : indexes.elements()) {
        compactedY.assignRow(row++, Y.get(index));
    }

    return compactedY.transpose().times(compactedY);
}
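
compactedY.transpose().times(compactedY) forms the Gram matrix Y'Y, which is numFeatures x numFeatures and symmetric no matter how many rows Y has. A standalone sketch of the same pattern on a toy matrix (illustrative, not solver code):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class GramMatrixSketch {
    public static void main(String[] args) {
        Matrix y = new DenseMatrix(new double[][] { { 1, 2 }, { 3, 4 }, { 5, 6 } }); // 3 rows, 2 features

        Matrix yty = y.transpose().times(y); // 2 x 2 Gram matrix
        System.out.println(yty.numRows() + " x " + yty.numCols());  // 2 x 2
        System.out.println(yty.get(0, 1) + " == " + yty.get(1, 0)); // 44.0 == 44.0, symmetric
    }
}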

From source file:com.twitter.algebra.nmf.NMFDriver.java

License:Apache License

/**
 * Reads X and factorizes it as X = A * Y, where X is r*c, A is r*k, Y is k*c
 *
 * @param conf
 * @param input path to X in SequenceFileFormat
 * @param output
 * @param nRows
 * @param nCols
 * @param k
 * @param nColPartitionsB
 * @throws Exception
 */
private void run(final Configuration conf, Path input, Path output, int nRows, int nCols, int k,
        int nColPartitionsB) throws Exception {
    log.info("reading X");
    final DistributedRowMatrix distXOrig = new DistributedRowMatrix(input, getTempPath(), nRows, nCols);
    distXOrig.setConf(conf);

    log.info("converting X");
    final DistributedRowMatrix distX = Sequence2MatrixFormatJob.run(conf, distXOrig, "X");

    log.info("writing Xt");
    final DistributedRowMatrix distXt = TransposeJob.transpose(distX, conf, "Xt");

    log.info("summing Xt rows (X cols)");
    Path xtRowSumPath = RowSquareSumJob.run(conf, distXt, "Xt-rowsum");
    Path xColSumPath = xtRowSumPath;
    Vector xColSumVec = AlgebraCommon.mapDirToSparseVector(xColSumPath, 1, distX.numCols(), conf);

    log.info("sampling X");
    DistributedRowMatrix distXr = SampleColsJob.run(conf, distX, sampleRate, "Xr");
    log.info("writing Yt");
    DistributedRowMatrix distYt = DistRndMatrixJob.random(conf, nCols, k, getTempPath(), "Yt");

    log.info("writing A");
    DistributedRowMatrix distA = DistRndMatrixJob.random(conf, nRows, k, getTempPath(), "A");

    long errorChange = -1;
    long prevError = -1;
    //in early rounds the error sometimes increases; we should not terminate because of that
    for (int round = 0; round < MAX_ROUNDS && (errorChange < 0 || errorChange < MIN_ERROR_CHANGE); round++) {
        System.out.println("ROUND " + round + " ......");

        //run in parallel
        final int final_round1 = round;
        final DistributedRowMatrix finalYt = distYt;
        Runnable r = new Runnable() {
            @Override
            public void run() {
                threadOutMatrix = null;
                try {
                    threadOutMatrix = //no need to rerun Xt-col (use the same label)
                            AtB_DMJ.smartRun(conf, distXt, finalYt, "XYt" + final_round1, "Xt-col",
                                    "Yt-col" + final_round1);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        otherThread = new Thread(r);
        otherThread.start();

        DistributedRowMatrix distYYt = new XtXJob().computeXtX(distYt, getTempPath(), conf, "YYt" + round);
        distYYt = CombinerJob.run(conf, distYYt, "YYt-compact" + round);

        otherThread.join();
        DistributedRowMatrix distXYt = threadOutMatrix;

        DistributedRowMatrix distAdotXYtdivAYYt = CompositeDMJ.run(conf, distA, distXYt, distYYt,
                "A.XYtdAYYt" + round, alpha1, alpha2);
        distA = distAdotXYtdivAYYt;

        //run in parallel
        final int final_round2 = round;
        final DistributedRowMatrix finalA = distA;
        r = new Runnable() {
            @Override
            public void run() {
                threadOutMatrix = null;
                try {
                    threadOutMatrix = //no need to rerun Xt-col (use the same label)
                            AtB_DMJ.smartRun(conf, distX, finalA, "XtA" + final_round2, "X-col",
                                    "A-col" + final_round2);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        otherThread = new Thread(r);
        otherThread.start();

        DistributedRowMatrix distAAt = new XtXJob().computeXtX(distA, getTempPath(), conf, "AAt" + round);
        Matrix centAAt = AlgebraCommon.toDenseMatrix(distAAt);
        // TODO: AtA could be simply a distributed job
        Matrix centAtA = centAAt.transpose();
        DistributedRowMatrix distAtA = AlgebraCommon.toMapDir(centAtA, getTempPath(), getTempPath(),
                "AtA" + round);

        otherThread.join();
        DistributedRowMatrix distXtA = threadOutMatrix;

        DistributedRowMatrix distYtdotXtAdivYtAtA = CompositeDMJ.run(conf, distYt, distXtA, distAtA,
                "Yt.XtAdYtAtA" + round, lambda1, lambda2);
        distYt = distYtdotXtAdivYtAtA;

        DistributedRowMatrix distYtr = SampleRowsJob.run(conf, distYt, sampleRate, "Ytr" + round);
        distYtr = CombinerJob.run(conf, distYtr, "Ytr-compact" + round);
        long error = ErrDMJ.run(conf, distXr, xColSumVec, distA, distYtr, "ErrJob" + round);
        if (error != -1 && prevError != -1)
            errorChange = (prevError - error);
        prevError = error;
    }
}
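
Several steps above rewrite one transposed product as another; the identity they rely on, (A*B)' = B'*A', can be checked in memory with small dense matrices (a hedged sketch, separate from the driver):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class ProductTransposeSketch {
    public static void main(String[] args) {
        Matrix a = new DenseMatrix(new double[][] { { 1, 2 }, { 3, 4 } });
        Matrix b = new DenseMatrix(new double[][] { { 0, 1 }, { 1, 0 } });

        Matrix left = a.times(b).transpose();
        Matrix right = b.transpose().times(a.transpose());
        // both prints show the same entry, since (A*B)' = B' * A'
        System.out.println(left.get(0, 1) + " == " + right.get(0, 1)); // 4.0 == 4.0
    }
}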

From source file:com.ydy.cf.solver.impl.AlternatingLeastSquaresSolver.java

License:Apache License

private Vector solve(Iterable<Vector> featureVectors, Vector ratingVector, double lambda, int numFeatures) {

    Preconditions.checkNotNull(featureVectors, "Feature vectors cannot be null");
    Preconditions.checkArgument(!Iterables.isEmpty(featureVectors));
    Preconditions.checkNotNull(ratingVector, "rating vector cannot be null");
    Preconditions.checkArgument(ratingVector.getNumNondefaultElements() > 0, "Rating vector cannot be empty");
    Preconditions.checkArgument(Iterables.size(featureVectors) == ratingVector.getNumNondefaultElements());

    int nui = ratingVector.getNumNondefaultElements();

    Matrix MiIi = createMiIi(featureVectors, numFeatures);
    Matrix RiIiMaybeTransposed = createRiIiMaybeTransposed(ratingVector);

    /* compute Ai = MiIi * t(MiIi) + lambda * nui * E */
    Matrix Ai = addLambdaTimesNuiTimesE(MiIi.times(MiIi.transpose()), lambda, nui);
    /* compute Vi = MiIi * t(R(i,Ii)) */
    Matrix Vi = MiIi.times(RiIiMaybeTransposed);
    /* compute Ai * ui = Vi */
    return solve(Ai, Vi);
}
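
Putting the two commented formulas together: Ai = MiIi * t(MiIi) + lambda * nui * E is a small square symmetric system that a dense solver can handle. A hedged in-memory sketch with made-up numbers (the solve(Ai, Vi) call above is the solver's own private method; here QRDecomposition from mahout-math stands in):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.QRDecomposition;

public class NormalEquationsSketch {
    public static void main(String[] args) {
        // hypothetical 2-feature model rated on 3 items
        Matrix miIi = new DenseMatrix(new double[][] { { 1, 2, 3 }, { 4, 5, 6 } }); // features x items
        Matrix riIi = new DenseMatrix(new double[][] { { 5 }, { 3 }, { 4 } });      // items x 1 ratings
        double lambda = 0.065;
        int nui = 3;

        Matrix ai = miIi.times(miIi.transpose()); // features x features
        for (int i = 0; i < ai.numRows(); i++) {
            ai.set(i, i, ai.get(i, i) + lambda * nui); // + lambda * nui * E
        }
        Matrix vi = miIi.times(riIi); // features x 1
        Matrix ui = new QRDecomposition(ai).solve(vi); // solves Ai * ui = Vi
        System.out.println(ui);
    }
}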

From source file:org.qcri.pca.SPCADriver.java

/**
 * Run sPCA
 * 
 * @param conf
 *          the configuration
 * @param distY
 *          the input matrix Y
 * @param output
 *          the path to the output (currently for normalization output)
 * @param nRows
 *          number of rows in input matrix
 * @param nCols
 *          number of columns in input matrix
 * @param nPCs
 *          number of desired principal components
 * @param splitFactor
 *          divide the block size by this number to increase parallelism
 * @param round
 *          the initial round index, used for naming each round output
 * @param LAST_ROUND
 *          the index of the last round
 * @param errSampleRate
 *          the sampling rate used when computing the reconstruction error
 * @return the error
 * @throws Exception
 */
double runMapReduce(Configuration conf, DistributedRowMatrix distY, InitialValues initVal, Path output,
        final int nRows, final int nCols, final int nPCs, final int splitFactor, final float errSampleRate,
        final int LAST_ROUND, final int normalize) throws Exception {
    int round = 0;
    //The two PPCA variables that improve over each iteration
    double ss = initVal.ss;
    Matrix centralC = initVal.C;
    //initial CtC
    Matrix centralCtC = centralC.transpose().times(centralC);
    final float threshold = 0.00001f;
    int sampleRate = 1;
    //1. compute mean and span
    DenseVector ym = new DenseVector(distY.numCols()); //ym=mean(distY)
    MeanAndSpanJob masJob = new MeanAndSpanJob();
    boolean normalizeMean = false;
    if (normalize == 1)
        normalizeMean = true;
    Path meanSpanPath = masJob.compuateMeanAndSpan(distY.getRowPath(), output, ym, normalizeMean, conf,
            "" + round + "-init");
    Path normalizedYPath = null;

    //2. normalize the input matrix Y
    if (normalize == 1) {

        NormalizeJob normalizeJob = new NormalizeJob();
        normalizedYPath = normalizeJob.normalize(conf, distY.getRowPath(), meanSpanPath, output, sampleRate,
                "" + round + "-init");
        distY = new DistributedRowMatrix(normalizedYPath, getTempPath(), nRows, nCols);
        distY.setConf(conf);
        //After normalization, set the split factor
        if (splitFactor > 1) {
            FileSystem fss = FileSystem.get(normalizedYPath.toUri(), conf);
            long blockSize = fss.getDefaultBlockSize() / splitFactor;
            conf.set("mapred.max.split.size", Long.toString(blockSize));
        }
    }
    if (normalizedYPath == null)
        normalizedYPath = distY.getRowPath();

    //3. compute the 2-norm of Y
    Norm2Job normJob = new Norm2Job();
    double norm2 = normJob.computeFNorm(conf, normalizedYPath, meanSpanPath, getTempPath(),
            "" + round + "-init");
    if (sampleRate < 1) { // rescale
        norm2 = norm2 / sampleRate;
    }

    DenseVector xm = new DenseVector(nPCs);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
    DistributedRowMatrix distY2X = null;
    DistributedRowMatrix distC = null;
    double prevObjective = Double.MAX_VALUE;
    double error = 0;
    double relChangeInObjective = Double.MAX_VALUE;
    double prevError = Double.MAX_VALUE;
    for (; (round < LAST_ROUND && relChangeInObjective > threshold && prevError > 0.02); round++) {
        // Sx = inv( ss * eye(d) + CtC );
        Matrix centralSx = centralCtC.clone();
        centralSx.viewDiagonal().assign(Functions.plus(ss));
        centralSx = inv(centralSx);
        // X = Y * C * Sx' => Y2X = C * Sx'
        Matrix centralY2X = centralC.times(centralSx.transpose());
        distY2X = PCACommon.toDistributedRowMatrix(centralY2X, getTempPath(), getTempPath(), "CSxt" + round);
        // Xm = Ym * Y2X
        PCACommon.denseVectorTimesMatrix(ym, centralY2X, xm);
        // We skip computing X as we generate it on demand using Y and Y2X

        //Compute X'X and Y'X 
        CompositeJob compositeJob = new CompositeJob();
        compositeJob.computeYtXandXtX(distY, distY2X, ym, xm, getTempPath(), conf, "" + round);
        Matrix centralXtX = compositeJob.xtx;
        Matrix centralYtX = compositeJob.ytx;
        if (sampleRate < 1) { // rescale
            centralXtX.assign(Functions.div(sampleRate));
            centralYtX.assign(Functions.div(sampleRate));
        }

        // XtX = X'*X + ss * Sx
        final double finalss = ss;
        centralXtX.assign(centralSx, new DoubleDoubleFunction() {
            @Override
            public double apply(double arg1, double arg2) {
                return arg1 + finalss * arg2;
            }
        });
        // C = (Ye'*X) / SumXtX;
        Matrix invXtX_central = inv(centralXtX);
        centralC = centralYtX.times(invXtX_central);
        distC = PCACommon.toDistributedRowMatrix(centralC, getTempPath(), getTempPath(), "C" + round);
        centralCtC = centralC.transpose().times(centralC);

        // Compute new value for ss
        // ss = ( sum(sum(Ye.^2)) + PCACommon.trace(XtX*CtC) - 2sum(XiCtYit) )
        // /(N*D);
        double ss2 = PCACommon.trace(centralXtX.times(centralCtC));
        VarianceJob varianceJob = new VarianceJob();
        double xctyt = varianceJob.computeVariance(distY, ym, distY2X, xm, distC, getTempPath(), conf,
                "" + round);
        if (sampleRate < 1) { // rescale
            xctyt = xctyt / sampleRate;
        }
        ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);
        double traceSx = PCACommon.trace(centralSx);
        double traceXtX = PCACommon.trace(centralXtX);
        double traceC = PCACommon.trace(centralC);
        double traceCtC = PCACommon.trace(centralCtC);
        log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceXtX + " " + traceC + " " + traceCtC);

        double objective = ss;
        relChangeInObjective = Math.abs(1 - objective / prevObjective);
        prevObjective = objective;
        log.info("Objective:  %.6f    relative change: %.6f \n", objective, relChangeInObjective);
        if (!CALCULATE_ERR_ATTHEEND) {
            log.info("Computing the error at round " + round + " ...");
            ReconstructionErrJob errJob = new ReconstructionErrJob();
            error = errJob.reconstructionErr(distY, distY2X, distC, centralC, ym, xm, errSampleRate, conf,
                    getTempPath(), "" + round);
            log.info("... end of computing the error at round " + round);
            prevError = error;
        }
    }

    if (CALCULATE_ERR_ATTHEEND) {
        log.info("Computing the error at round " + round + " ...");
        ReconstructionErrJob errJob = new ReconstructionErrJob();
        error = errJob.reconstructionErr(distY, distY2X, distC, centralC, ym, xm, errSampleRate, conf,
                getTempPath(), "" + round);
        log.info("... end of computing the error at round " + round);
    }

    initVal.C = centralC;
    initVal.ss = ss;
    writeMatrix(initVal.C, output, getTempPath(), "PCs");
    return error;

}

From source file:org.qcri.pca.SPCADriver.java

/**
 * Run PPCA sequentially given a small input Y that fits into memory. This
 * can also be used on data sampled from a distributed matrix.
 *
 * Note: this implementation ignores NaN values by replacing them with 0
 * 
 * @param conf
 *          the configuration
 * @param centralY
 *          the input matrix
 * @param initVal
 *          the initial values for C and ss
 * @param MAX_ROUNDS
 *          maximum number of iterations
 * @return the error
 * @throws Exception
 */
double runSequential(Configuration conf, Matrix centralY, InitialValues initVal, final int MAX_ROUNDS)
        throws Exception {
    Matrix centralC = initVal.C;
    double ss = initVal.ss;
    final int nRows = centralY.numRows();
    final int nCols = centralY.numCols();
    final int nPCs = centralC.numCols();
    final float threshold = 0.00001f;

    log.info("tracec= " + PCACommon.trace(centralC));
    //ignore NaN elements by replacing them with 0
    for (int r = 0; r < nRows; r++)
        for (int c = 0; c < nCols; c++)
            if (Double.isNaN(centralY.getQuick(r, c))) {
                centralY.setQuick(r, c, 0);
            }

    //centralize and normalize the input matrix
    Vector mean = centralY.aggregateColumns(new VectorFunction() {
        @Override
        public double apply(Vector v) {
            return v.zSum() / nRows;
        }
    });
    //also normalize the matrix by dividing each element by its columns range
    Vector spanVector = new DenseVector(nCols);
    for (int c = 0; c < nCols; c++) {
        Vector col = centralY.viewColumn(c);
        double max = col.maxValue();
        double min = col.minValue();
        double span = max - min;
        spanVector.setQuick(c, span);
    }
    for (int r = 0; r < nRows; r++)
        for (int c = 0; c < nCols; c++)
            centralY.set(r, c, (centralY.get(r, c) - mean.get(c))
                    / (spanVector.getQuick(c) != 0 ? spanVector.getQuick(c) : 1));

    Matrix centralCtC = centralC.transpose().times(centralC);
    log.info("tracectc= " + PCACommon.trace(centralCtC));
    log.info("traceinvctc= " + PCACommon.trace(inv(centralCtC)));
    log.info("traceye= " + PCACommon.trace(centralY));
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);

    int count = 1;
    // old = Inf;
    double old = Double.MAX_VALUE;
    // -------------------------- EM Iterations
    // while count
    Matrix centralX = null;
    int round = 0;
    while (round < MAX_ROUNDS && count > 0) {
        round++;
        // Sx = inv( eye(d) + CtC/ss );
        Matrix Sx = eye(nPCs).times(ss).plus(centralCtC);
        Sx = inv(Sx);
        // X = Ye*C*(Sx/ss);
        centralX = centralY.times(centralC).times(Sx.transpose());
        // XtX = X'*X + ss * Sx;
        Matrix centralXtX = centralX.transpose().times(centralX).plus(Sx.times(ss));
        // C = (Ye'*X) / XtX;
        Matrix tmpInv = inv(centralXtX);
        centralC = centralY.transpose().times(centralX).times(tmpInv);
        // CtC = C'*C;
        centralCtC = centralC.transpose().times(centralC);
        // ss = ( sum(sum( (X*C'-Ye).^2 )) + trace(XtX*CtC) - 2*xcty ) /(N*D);
        double norm2 = centralY.clone().assign(new DoubleFunction() {
            @Override
            public double apply(double arg1) {
                return arg1 * arg1;
            }
        }).zSum();
        ss = norm2 + PCACommon.trace(centralXtX.times(centralCtC));
        //ss3 = sum (X(i:0) * C' * Y(i,:)')
        DenseVector resVector = new DenseVector(nCols);
        double xctyt = 0;
        for (int i = 0; i < nRows; i++) {
            PCACommon.vectorTimesMatrixTranspose(centralX.viewRow(i), centralC, resVector);
            double res = resVector.dot(centralY.viewRow(i));
            xctyt += res;
        }
        ss -= 2 * xctyt;
        ss /= (nRows * nCols);

        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
        double traceSx = PCACommon.trace(Sx);
        double traceX = PCACommon.trace(centralX);
        double traceSumXtX = PCACommon.trace(centralXtX);
        double traceC = PCACommon.trace(centralC);
        double traceCtC = PCACommon.trace(centralCtC);
        log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceX + " " + traceSumXtX + " " + traceC + " "
                + traceCtC + " " + 0);

        double objective = ss;
        double rel_ch = Math.abs(1 - objective / old);
        old = objective;
        count++;
        if (rel_ch < threshold && count > 5)
            count = 0;
        log.info("Objective:  %.6f    relative change: %.6f \n", objective, rel_ch);
    }

    double norm1Y = centralY.aggregateColumns(new VectorNorm1()).maxValue();
    log.info("Norm1 of Ye is: " + norm1Y);
    Matrix newYerror = centralY.minus(centralX.times(centralC.transpose()));
    double norm1Err = newYerror.aggregateColumns(new VectorNorm1()).maxValue();
    log.info("Norm1 of the reconstruction error is: " + norm1Err);

    initVal.C = centralC;
    initVal.ss = ss;
    return norm1Err / norm1Y;
}
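
A note on the Sx.transpose() above: Sx is the inverse of a symmetric matrix (CtC plus a diagonal shift) and is therefore symmetric itself, so the transpose changes nothing numerically; it just mirrors the (Sx/ss)' in the MATLAB-style formula. A minimal check (illustrative sketch):

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class SymmetricTransposeSketch {
    public static void main(String[] args) {
        Matrix s = new DenseMatrix(new double[][] { { 2, 1 }, { 1, 3 } }); // symmetric
        Matrix st = s.transpose();
        System.out.println(s.get(0, 1) + " == " + st.get(0, 1)); // 1.0 == 1.0
    }
}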

From source file:org.qcri.pca.SPCADriver.java

/**
 * Run PPCA sequentially given a small input Y that fits into memory. This
 * can also be used on data sampled from a distributed matrix.
 *
 * Note: this implementation ignores NaN values by replacing them with 0
 * 
 * @param conf
 *          the configuration
 * @param centralY
 *          the input matrix
 * @param initVal
 *          the initial values for C and ss
 * @param MAX_ROUNDS
 *          maximum number of iterations
 * @return the error
 * @throws Exception
 */
double runSequential_JacobVersion(Configuration conf, Matrix centralY, InitialValues initVal,
        final int MAX_ROUNDS) {
    Matrix centralC = initVal.C; // the current implementation doesn't use the initial ss of initVal
    final int nRows = centralY.numRows();
    final int nCols = centralY.numCols();
    final int nPCs = centralC.numCols();
    final float threshold = 0.00001f;

    log.info("tracec= " + PCACommon.trace(centralC));
    // Y = Y - mean(Ye)
    // Also normalize the matrix
    for (int r = 0; r < nRows; r++)
        for (int c = 0; c < nCols; c++)
            if (Double.isNaN(centralY.getQuick(r, c))) {
                centralY.setQuick(r, c, 0);
            }
    Vector mean = centralY.aggregateColumns(new VectorFunction() {
        @Override
        public double apply(Vector v) {
            return v.zSum() / nRows;
        }
    });
    Vector spanVector = new DenseVector(nCols);
    for (int c = 0; c < nCols; c++) {
        Vector col = centralY.viewColumn(c);
        double max = col.maxValue();
        double min = col.minValue();
        double span = max - min;
        spanVector.setQuick(c, span);
    }
    for (int r = 0; r < nRows; r++)
        for (int c = 0; c < nCols; c++)
            centralY.set(r, c, (centralY.get(r, c) - mean.get(c))
                    / (spanVector.getQuick(c) != 0 ? spanVector.getQuick(c) : 1));

    // -------------------------- initialization
    // CtC = C'*C;
    Matrix centralCtC = centralC.transpose().times(centralC);
    log.info("tracectc= " + PCACommon.trace(centralCtC));
    log.info("traceinvctc= " + PCACommon.trace(inv(centralCtC)));
    log.info("traceye= " + PCACommon.trace(centralY));
    // X = Ye * C * inv(CtC);
    Matrix centralX = centralY.times(centralC).times(inv(centralCtC));
    log.info("tracex= " + PCACommon.trace(centralX));
    // recon = X * C';
    Matrix recon = centralX.times(centralC.transpose());
    log.info("tracerec= " + PCACommon.trace(recon));
    // ss = sum(sum((recon-Ye).^2)) / (N*D-missing);
    double ss = recon.minus(centralY).assign(new DoubleFunction() {
        @Override
        public double apply(double arg1) {
            return arg1 * arg1;
        }
    }).zSum() / (nRows * nCols);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);

    int count = 1;
    // old = Inf;
    double old = Double.MAX_VALUE;
    // -------------------------- EM Iterations
    // while count
    int round = 0;
    while (round < MAX_ROUNDS && count > 0) {
        round++;
        // ------------------ E-step, (co)variances
        // Sx = inv( eye(d) + CtC/ss );
        Matrix centralSx = eye(nPCs).plus(centralCtC.divide(ss));
        centralSx = inv(centralSx);
        // ------------------ E-step expected value
        // X = Ye*C*(Sx/ss);
        centralX = centralY.times(centralC).times(centralSx.divide(ss));
        // ------------------ M-step
        // SumXtX = X'*X;
        Matrix centralSumXtX = centralX.transpose().times(centralX);
        // C = (Ye'*X) / (SumXtX + N*Sx );
        Matrix tmpInv = inv(centralSumXtX.plus(centralSx.times(nRows)));
        centralC = centralY.transpose().times(centralX).times(tmpInv);
        // CtC = C'*C;
        centralCtC = centralC.transpose().times(centralC);
        // ss = ( sum(sum( (X*C'-Ye).^2 )) + N*sum(sum(CtC.*Sx)) +
        // missing*ss_old ) /(N*D);
        recon = centralX.times(centralC.transpose());
        double error = recon.minus(centralY).assign(new DoubleFunction() {
            @Override
            public double apply(double arg1) {
                return arg1 * arg1;
            }
        }).zSum();
        ss = error + nRows * dot(centralCtC.clone(), centralSx).zSum();
        ss /= (nRows * nCols);

        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
        double traceSx = PCACommon.trace(centralSx);
        double traceX = PCACommon.trace(centralX);
        double traceSumXtX = PCACommon.trace(centralSumXtX);
        double traceC = PCACommon.trace(centralC);
        double traceCtC = PCACommon.trace(centralCtC);
        log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceX + " " + traceSumXtX + " " + traceC + " "
                + traceCtC + " " + 0);

        // objective = N*D + N*(D*log(ss) +PCACommon.trace(Sx)-log(det(Sx)) )
        // +PCACommon.trace(SumXtX) -missing*log(ss_old);
        double objective = nRows * nCols + nRows
                * (nCols * Math.log(ss) + PCACommon.trace(centralSx) - Math.log(centralSx.determinant()))
                + PCACommon.trace(centralSumXtX);
        double rel_ch = Math.abs(1 - objective / old);
        old = objective;
        count++;
        if (rel_ch < threshold && count > 5)
            count = 0;
        System.out.printf("Objective:  %.6f    relative change: %.6f \n", objective, rel_ch);
    }

    double norm1Y = centralY.aggregateColumns(new VectorNorm1()).maxValue();
    log.info("Norm1 of Y is: " + norm1Y);
    Matrix newYerror = centralY.minus(centralX.times(centralC.transpose()));
    double norm1Err = newYerror.aggregateColumns(new VectorNorm1()).maxValue();
    log.info("Norm1 of the reconstruction error is: " + norm1Err);

    initVal.C = centralC;
    initVal.ss = ss;
    return norm1Err / norm1Y;
}

From source file:org.qcri.sparkpca.SparkPCA.java

/**
 * Compute principal component analysis where the input is an RDD<org.apache.spark.mllib.linalg.Vector> of vectors such that each vector represents a row in the matrix
 * @param sc
 *           Spark context that contains the configuration parameters and represents the connection to the cluster
 *          (used to create RDDs, accumulators and broadcast variables on that cluster)
 * @param vectors
 *          An RDD of vectors representing the rows of the input matrix
 * @param nRows
 *          Number of rows in input Matrix
 * @param nCols
 *          Number of columns in input Matrix
 * @param nPCs
 *          Number of desired principal components
 * @param errRate
 *          The sampling rate that is used for computing the reconstruction error
 * @param maxIterations 
 *          Maximum number of iterations before terminating
 * @return Matrix of size nCols X nPCs having the desired principal components
 */
public static org.apache.spark.mllib.linalg.Matrix computePrincipalComponents(JavaSparkContext sc,
        JavaRDD<org.apache.spark.mllib.linalg.Vector> vectors, String outputPath, final int nRows,
        final int nCols, final int nPCs, final double errRate, final int maxIterations,
        final int computeProjectedMatrix) {

    System.out.println("Rows: " + nRows + ", Columns " + nCols);
    double ss;
    Matrix centralC;

    // The two PPCA variables that improve over each iteration: Principal component matrix (C) and random variance (ss).
    // Initialized randomly for the first iteration
    ss = PCAUtils.randSS();
    centralC = PCAUtils.randomMatrix(nCols, nPCs);

    // 1. Mean Job : This job calculates the mean and span of the columns of the input RDD<org.apache.spark.mllib.linalg.Vector>
    final Accumulator<double[]> matrixAccumY = sc.accumulator(new double[nCols], new VectorAccumulatorParam());
    final double[] internalSumY = new double[nCols];
    vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {

        public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
            org.apache.spark.mllib.linalg.Vector yi;
            int[] indices = null;
            int i;
            while (arg0.hasNext()) {
                yi = arg0.next();
                indices = ((SparseVector) yi).indices();
                for (i = 0; i < indices.length; i++) {
                    internalSumY[indices[i]] += yi.apply(indices[i]);
                }
            }
            matrixAccumY.add(internalSumY);
        }

    });//End Mean Job

    //Get the sum of column Vector from the accumulator and divide each element by the number of rows to get the mean
    final Vector meanVector = new DenseVector(matrixAccumY.value()).divide(nRows);
    final Broadcast<Vector> br_ym_mahout = sc.broadcast(meanVector);

    //2. Frobenius Norm Job : Obtain the Frobenius norm of the input RDD<org.apache.spark.mllib.linalg.Vector>
    /**
     * To compute the norm2 of a sparse matrix, iterate over sparse items and sum
     * square of the difference. After processing each row, add the sum of the
     * meanSquare of the zero-elements that were ignored in the sparse iteration.
    */
    double meanSquareSumTmp = 0;
    for (int i = 0; i < br_ym_mahout.value().size(); i++) {
        double element = br_ym_mahout.value().getQuick(i);
        meanSquareSumTmp += element * element;
    }
    final double meanSquareSum = meanSquareSumTmp;
    final Accumulator<Double> doubleAccumNorm2 = sc.accumulator(0.0);
    vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {

        public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
            double norm2 = 0;
            double meanSquareSumOfZeroElements = meanSquareSum;
            int[] indices = ((SparseVector) yi).indices();
            int i;
            int index;
            double v;
            // iterate over the non-zero elements
            for (i = 0; i < indices.length; i++) {
                index = indices[i];
                v = yi.apply(index);
                double mean = br_ym_mahout.value().getQuick(index);
                double diff = v - mean;
                diff *= diff;

                // cancel the effect of the non-zero element in meanSquareSum
                meanSquareSumOfZeroElements -= mean * mean;
                norm2 += diff;
            }
            // For all zero items, the following adds the sum of mean squares
            norm2 += meanSquareSumOfZeroElements;
            doubleAccumNorm2.add(norm2);
        }

    });//end Frobenius Norm Job

    double norm2 = doubleAccumNorm2.value();
    log.info("NOOOOORM2=" + norm2);

    // initial CtC  
    Matrix centralCtC = centralC.transpose().times(centralC);

    Matrix centralY2X = null;
    // -------------------------- EM Iterations
    // while count
    int round = 0;
    double prevObjective = Double.MAX_VALUE;
    double error = 0;
    double relChangeInObjective = Double.MAX_VALUE;
    double prevError = Double.MAX_VALUE;
    final float threshold = 0.00001f;
    final int firstRound = round;

    for (; (round < maxIterations && relChangeInObjective > threshold && prevError > 0.02); round++) {

        // Sx = inv( ss * eye(d) + CtC );
        Matrix centralSx = centralCtC.clone();

        // Sx = inv( eye(d) + CtC/ss );
        centralSx.viewDiagonal().assign(Functions.plus(ss));
        centralSx = PCAUtils.inv(centralSx);

        // X = Y * C * Sx' => Y2X = C * Sx'
        centralY2X = centralC.times(centralSx.transpose());

        //Broadcast Y2X because it will be used in several jobs and several iterations.
        final Broadcast<Matrix> br_centralY2X = sc.broadcast(centralY2X);

        // Xm = Ym * Y2X
        Vector xm_mahout = new DenseVector(nPCs);
        xm_mahout = PCAUtils.denseVectorTimesMatrix(br_ym_mahout.value(), centralY2X, xm_mahout);

        //Broadcast Xm because it will be used in several iterations.
        final Broadcast<Vector> br_xm_mahout = sc.broadcast(xm_mahout);
        // We skip computing X as we generate it on demand using Y and Y2X

        // 3. X'X and Y'X Job:  The job computes the two matrices X'X and Y'X
        /**
         * Xc = Yc * MEM   (MEM is the in-memory broadcasted matrix Y2X)
         * 
         * XtX = Xc' * Xc
         * 
         * YtX = Yc' * Xc
         * 
         * It also considers that Y is sparse and receives the mean vectors Ym and Xm
         * separately.
         * 
         * Yc = Y - Ym
         * 
         * Xc = X - Xm
         * 
         * Xc = (Y - Ym) * MEM = Y * MEM - Ym * MEM = X - Xm
         * 
         * XtX = (X - Xm)' * (X - Xm)
         * 
         * YtX = (Y - Ym)' * (X - Xm)
         * 
        */
        final Accumulator<double[][]> matrixAccumXtx = sc.accumulator(new double[nPCs][nPCs],
                new MatrixAccumulatorParam());
        final Accumulator<double[][]> matrixAccumYtx = sc.accumulator(new double[nCols][nPCs],
                new MatrixAccumulatorParam());
        final Accumulator<double[]> matrixAccumX = sc.accumulator(new double[nPCs],
                new VectorAccumulatorParam());

        /*
         * Initialize the output matrices and vectors once in order to avoid generating massive intermediate data in the workers
         */
        final double[][] resArrayYtX = new double[nCols][nPCs];
        final double[][] resArrayXtX = new double[nPCs][nPCs];
        final double[] resArrayX = new double[nPCs];

        /*
         * Used to sum the vectors in one partition.
        */
        final double[][] internalSumYtX = new double[nCols][nPCs];
        final double[][] internalSumXtX = new double[nPCs][nPCs];
        final double[] internalSumX = new double[nPCs];

        vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {

            public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
                org.apache.spark.mllib.linalg.Vector yi;
                while (arg0.hasNext()) {
                    yi = arg0.next();

                    /*
                      * Perform in-memory matrix multiplication xi = yi' * Y2X
                     */
                    PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);

                    //get only the sparse indices
                    int[] indices = ((SparseVector) yi).indices();

                    PCAUtils.outerProductWithIndices(yi, br_ym_mahout.value(), resArrayX, br_xm_mahout.value(),
                            resArrayYtX, indices);
                    PCAUtils.outerProductArrayInput(resArrayX, br_xm_mahout.value(), resArrayX,
                            br_xm_mahout.value(), resArrayXtX);
                    int i, j, rowIndexYtX;

                    //add the sparse indices only
                    for (i = 0; i < indices.length; i++) {
                        rowIndexYtX = indices[i];
                        for (j = 0; j < nPCs; j++) {
                            internalSumYtX[rowIndexYtX][j] += resArrayYtX[rowIndexYtX][j];
                            resArrayYtX[rowIndexYtX][j] = 0; //reset it
                        }

                    }
                    for (i = 0; i < nPCs; i++) {
                        internalSumX[i] += resArrayX[i];
                        for (j = 0; j < nPCs; j++) {
                            internalSumXtX[i][j] += resArrayXtX[i][j];
                            resArrayXtX[i][j] = 0; //reset it
                        }

                    }
                }
                matrixAccumX.add(internalSumX);
                matrixAccumXtx.add(internalSumXtX);
                matrixAccumYtx.add(internalSumYtX);
            }

        });//end  X'X and Y'X Job

        /*
         * Get the values of the accumulators.
        */
        Matrix centralYtX = new DenseMatrix(matrixAccumYtx.value());
        Matrix centralXtX = new DenseMatrix(matrixAccumXtx.value());
        Vector centralSumX = new DenseVector(matrixAccumX.value());

        /*
          * Mi = (Yi-Ym)' x (Xi-Xm) = Yi' x (Xi-Xm) - Ym' x (Xi-Xm)
          * 
          * M = Sum(Mi) = Sum(Yi' x (Xi-Xm)) - Ym' x (Sum(Xi)-N*Xm)
          * 
          * The first part is done in the previous job and the second in the following method
         */
        centralYtX = PCAUtils.updateXtXAndYtx(centralYtX, centralSumX, br_ym_mahout.value(), xm_mahout, nRows);
        centralXtX = PCAUtils.updateXtXAndYtx(centralXtX, centralSumX, xm_mahout, xm_mahout, nRows);

        // XtX = X'*X + ss * Sx
        final double finalss = ss;
        centralXtX.assign(centralSx, new DoubleDoubleFunction() {
            @Override
            public double apply(double arg1, double arg2) {
                return arg1 + finalss * arg2;
            }
        });

        // C = (Ye'*X) / SumXtX;
        Matrix invXtX_central = PCAUtils.inv(centralXtX);
        centralC = centralYtX.times(invXtX_central);
        centralCtC = centralC.transpose().times(centralC);

        // Compute new value for ss
        // ss = ( sum(sum(Ye.^2)) + trace(XtX*CtC) - 2sum(XiCtYit))/(N*D);

        //4. Variance Job: Computes part of variance that requires a distributed job
        /**
         * xcty = Sum (xi * C' * yi')
         * 
         * We also regenerate xi on demand by the following formula:
         * 
         * xi = yi * y2x
         * 
         * To make it efficient for sparse matrices that are not mean-centered, we receive the mean
         * separately:
         * 
         * xi = (yi - ym) * y2x = yi * y2x - xm, where xm = ym*y2x
         * 
         * xi * C' * (yi-ym)' = xi * ((yi-ym)*C)' = xi * (yi*C - ym*C)'
         * 
         */

        double ss2 = PCAUtils.trace(centralXtX.times(centralCtC));
        final double[] resArrayYmC = new double[centralC.numCols()];
        PCAUtils.denseVectorTimesMatrix(meanVector, centralC, resArrayYmC);
        final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
        final double[] resArrayYiC = new double[centralC.numCols()];
        final Accumulator<Double> doubleAccumXctyt = sc.accumulator(0.0);
        vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
            public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
                PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
                PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value(), resArrayYiC);

                PCAUtils.subtract(resArrayYiC, resArrayYmC);
                double dotRes = PCAUtils.dot(resArrayX, resArrayYiC);
                doubleAccumXctyt.add(dotRes);
            }
        }); //end Variance Job

        double xctyt = doubleAccumXctyt.value();
        ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);

        double objective = ss;
        relChangeInObjective = Math.abs(1 - objective / prevObjective);
        prevObjective = objective;
        log.info("Objective:  %.6f    relative change: %.6f \n", objective, relChangeInObjective);

        if (!CALCULATE_ERR_ATTHEEND) {
            log.info("Computing the error at round " + round + " ...");
            System.out.println("Computing the error at round " + round + " ...");

            double[] resArrayZmTmp = new double[centralC.numRows()];
            PCAUtils.vectorTimesMatrixTranspose(xm_mahout, centralC, resArrayZmTmp);
            final double[] resArrayZm = PCAUtils.subtractVectorArray(resArrayZmTmp, meanVector);

            //5. Reconstruction Error Job: The job computes the reconstruction error of the input matrix after updating the principal 
            //components matrix
            /**
             * Xc = Yc * Y2X
             * 
             * ReconY = Xc * C'
             * 
             * Err = ReconY - Yc
             * 
             * Norm2(Err) = abs(Err).zSum().max()
             * 
             * To take the sparse matrixes into account we receive the mean separately:
             * 
             * X = (Y - Ym) * Y2X = X - Xm, where X=Y*Y2X and Xm=Ym*Y2X
             * 
             * ReconY = (X - Xm) * C' = X*C' - Xm*C'
             * 
             * Err = X*C' - Xm*C' - (Y - Ym) = X*C' - Y - Zm, where where Zm=Xm*C'-Ym
             * 
             */

            final Accumulator<double[]> vectorAccumErr = sc.accumulator(new double[nCols],
                    new VectorAccumulatorAbsParam());
            final Accumulator<double[]> vectorAccumNormCentralized = sc.accumulator(new double[nCols],
                    new VectorAccumulatorAbsParam());
            final double[] xiCtArray = new double[centralC.numRows()];
            final double[] normalizedArray = new double[nCols];

            vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
                public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
                    if (PCAUtils.pass(errRate))
                        return;

                    PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
                    PCAUtils.vectorTimesMatrixTranspose(resArrayX, br_centralC.value(), xiCtArray);

                    PCAUtils.denseVectorSubtractSparseSubtractDense(xiCtArray, yi, resArrayZm);
                    vectorAccumErr.add(xiCtArray);

                    PCAUtils.denseVectorMinusVector(yi, br_ym_mahout.value(), normalizedArray);
                    vectorAccumNormCentralized.add(normalizedArray);
                }
            }); // end Reconstruction Error Job

            /*
             * The norm of the reconstruction error matrix
             */
            double reconstructionError = PCAUtils.getMax(vectorAccumErr.value());
            log.info("************************ReconsructionError=" + reconstructionError);

            /*
             * The norm of the input matrix after centralization
            */
            double centralizedYNorm = PCAUtils.getMax(vectorAccumNormCentralized.value());
            log.info("************************CentralizedNOrm=" + centralizedYNorm);

            error = reconstructionError / centralizedYNorm;
            log.info("... end of computing the error at round " + round + " And error=" + error);

            prevError = error;
        }
    }
    if (computeProjectedMatrix == 1) {
        //7. Compute Projected Matrix Job: The job multiplies the input matrix Y with the principal components C to get the Projected vectors 
        final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
        JavaRDD<org.apache.spark.mllib.linalg.Vector> projectedVectors = vectors.map(
                new Function<org.apache.spark.mllib.linalg.Vector, org.apache.spark.mllib.linalg.Vector>() {

                    public org.apache.spark.mllib.linalg.Vector call(org.apache.spark.mllib.linalg.Vector yi)
                            throws Exception {

                        return PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value());
                    }
                });//End Compute Projected Matrix
        String path = outputPath + File.separator + "ProjectedMatrix";
        projectedVectors.saveAsTextFile(path);
    }
    return PCAUtils.convertMahoutToSparkMatrix(centralC);

}