List of usage examples for org.apache.mahout.math Matrix clone
Matrix clone();
From source file:com.cloudera.knittingboar.sgd.ParallelOnlineLogisticRegression.java
License:Apache License
/**
 * Replaces this model's beta matrix with a copy of the supplied master copy,
 * so later mutations of the argument cannot leak into this model's state.
 *
 * @param beta_mstr_cpy the master beta matrix to copy from
 */
public void SetBeta(Matrix beta_mstr_cpy) {
    final Matrix defensiveCopy = beta_mstr_cpy.clone();
    this.beta = defensiveCopy;
}
From source file:edu.snu.dolphin.bsp.examples.ml.algorithms.clustering.em.EMMainCmpTask.java
License:Apache License
/**
 * Returns a new matrix containing the product of each value of the recipient
 * and the argument. Exploits sparsity: only non-zero entries of each row
 * slice are visited (zero entries remain zero under scalar multiplication,
 * and the clone already carries them).
 *
 * @param matrix the input matrix; left unmodified
 * @param scala  the scalar factor
 * @return a fresh matrix holding {@code matrix} scaled by {@code scala}
 */
private Matrix times(final Matrix matrix, final double scala) {
    final Matrix scaled = matrix.clone();
    // Matrix is Iterable<MatrixSlice>; walk each row slice's non-zeros only.
    for (final MatrixSlice rowSlice : matrix) {
        final int rowIndex = rowSlice.index();
        for (final Vector.Element entry : rowSlice.nonZeroes()) {
            scaled.set(rowIndex, entry.index(), entry.get() * scala);
        }
    }
    return scaled;
}
From source file:edu.snu.dolphin.bsp.examples.ml.data.ClusterStats.java
License:Apache License
/**
 * Builds the cluster statistics holder, either deep-copying the matrix/vector
 * members or keeping direct references to the caller's instances.
 *
 * @param outProdSum accumulated outer-product matrix for this cluster
 * @param pointSum   accumulated point vector for this cluster
 * @param probSum    accumulated membership probability
 * @param isDeepCopy when {@code true}, clone {@code outProdSum} and
 *                   {@code pointSum}; when {@code false}, alias the caller's
 *                   objects (cheaper, but shared mutable state)
 */
public ClusterStats(final Matrix outProdSum, final Vector pointSum, final double probSum,
        final boolean isDeepCopy) {
    this.outProdSum = isDeepCopy ? outProdSum.clone() : outProdSum;
    this.pointSum = isDeepCopy ? pointSum.clone() : pointSum;
    this.probSum = probSum;
}
From source file:org.qcri.pca.PCATest.java
License:Apache License
@Test public void crossTestIterationOfMapReducePPCASequentialPPCA() throws Exception { Matrix C_central = PCACommon.randomMatrix(D, d); double ss = PCACommon.randSS(); InitialValues initValSeq = new InitialValues(C_central, ss); InitialValues initValMR = new InitialValues(C_central.clone(), ss); //1. run sequential Matrix Ye_central = new DenseMatrix(N, D); int row = 0;/*w w w . j a va2 s.c om*/ for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, null, conf)) { Ye_central.assignRow(row, vw.get()); row++; } double bishopSeqErr = ppcaDriver.runSequential(conf, Ye_central, initValSeq, 1); //2. run mapreduce DistributedRowMatrix Ye = new DistributedRowMatrix(input, tmp, N, D); Ye.setConf(conf); double bishopMRErr = ppcaDriver.runMapReduce(conf, Ye, initValMR, output, N, D, d, 1, 1, 1, 1); Assert.assertEquals("ss value is different in sequential and mapreduce PCA", initValSeq.ss, initValMR.ss, EPSILON); double seqCTrace = PCACommon.trace(initValSeq.C); double mrCTrace = PCACommon.trace(initValMR.C); Assert.assertEquals("C value is different in sequential and mapreduce PCA", seqCTrace, mrCTrace, EPSILON); Assert.assertEquals("The PPCA error between sequntial and mapreduce methods is too different: " + bishopSeqErr + "!= " + bishopMRErr, bishopSeqErr, bishopMRErr, EPSILON); }
From source file:org.qcri.pca.SPCADriver.java
/**
 * Run sPCA: iterates the PPCA EM updates as a chain of MapReduce jobs until
 * the objective stops improving, the error gets small, or LAST_ROUND is hit.
 *
 * @param conf the Hadoop configuration
 * @param distY the distributed input matrix Y
 * @param initVal initial values for C and ss; updated in place with the final values
 * @param output the path to the output (currently for normalization output and the PCs)
 * @param nRows number of rows in the input matrix
 * @param nCols number of columns in the input matrix
 * @param nPCs number of desired principal components
 * @param splitFactor divide the block size by this number to increase parallelism
 * @param errSampleRate sampling rate used when computing the reconstruction error
 * @param LAST_ROUND the index of the last round
 * @param normalize when 1, mean-center and normalize the input before iterating
 * @return the reconstruction error of the last round
 * @throws Exception if any of the underlying MapReduce jobs fails
 */
double runMapReduce(Configuration conf, DistributedRowMatrix distY, InitialValues initVal, Path output,
    final int nRows, final int nCols, final int nPCs, final int splitFactor, final float errSampleRate,
    final int LAST_ROUND, final int normalize) throws Exception {
  int round = 0;
  // The two PPCA variables that improve over each iteration
  double ss = initVal.ss;
  Matrix centralC = initVal.C;
  // initial CtC
  Matrix centralCtC = centralC.transpose().times(centralC);
  final float threshold = 0.00001f; // relative-change-in-objective stopping threshold
  int sampleRate = 1; // constant 1 here, so the `sampleRate < 1` rescale branches below are dead
  // 1. compute mean and span
  DenseVector ym = new DenseVector(distY.numCols()); // ym = mean(distY), filled by the job
  MeanAndSpanJob masJob = new MeanAndSpanJob();
  boolean normalizeMean = false;
  if (normalize == 1)
    normalizeMean = true;
  // NOTE(review): "compuateMeanAndSpan" is a typo in the external job API; kept as-is.
  Path meanSpanPath = masJob.compuateMeanAndSpan(distY.getRowPath(), output, ym, normalizeMean, conf,
      "" + round + "-init");
  Path normalizedYPath = null;
  // 2. normalize the input matrix Y
  if (normalize == 1) {
    NormalizeJob normalizeJob = new NormalizeJob();
    normalizedYPath = normalizeJob.normalize(conf, distY.getRowPath(), meanSpanPath, output, sampleRate,
        "" + round + "-init");
    distY = new DistributedRowMatrix(normalizedYPath, getTempPath(), nRows, nCols);
    distY.setConf(conf);
    // After normalization, set the split factor (smaller splits => more map tasks)
    if (splitFactor > 1) {
      FileSystem fss = FileSystem.get(normalizedYPath.toUri(), conf);
      long blockSize = fss.getDefaultBlockSize() / splitFactor;
      conf.set("mapred.max.split.size", Long.toString(blockSize));
    }
  }
  if (normalizedYPath == null)
    normalizedYPath = distY.getRowPath();
  // 3. compute the squared Frobenius norm of Y (used in the ss update)
  Norm2Job normJob = new Norm2Job();
  double norm2 = normJob.computeFNorm(conf, normalizedYPath, meanSpanPath, getTempPath(), "" + round + "-init");
  if (sampleRate < 1) { // rescale
    norm2 = norm2 / sampleRate;
  }
  DenseVector xm = new DenseVector(nPCs); // xm = ym * Y2X, recomputed each round
  log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
  DistributedRowMatrix distY2X = null;
  DistributedRowMatrix distC = null;
  double prevObjective = Double.MAX_VALUE;
  double error = 0;
  double relChangeInObjective = Double.MAX_VALUE;
  double prevError = Double.MAX_VALUE;
  // EM loop: stop on round budget, on objective convergence, or when the error is small enough.
  for (; (round < LAST_ROUND && relChangeInObjective > threshold && prevError > 0.02); round++) {
    // E-step: Sx = inv( ss * eye(d) + CtC );
    Matrix centralSx = centralCtC.clone();
    centralSx.viewDiagonal().assign(Functions.plus(ss));
    centralSx = inv(centralSx);
    // X = Y * C * Sx' => Y2X = C * Sx' (X itself is regenerated on demand from Y and Y2X)
    Matrix centralY2X = centralC.times(centralSx.transpose());
    distY2X = PCACommon.toDistributedRowMatrix(centralY2X, getTempPath(), getTempPath(), "CSxt" + round);
    // Xm = Ym * Y2X
    PCACommon.denseVectorTimesMatrix(ym, centralY2X, xm);
    // We skip computing X as we generate it on demand using Y and Y2X
    // Compute X'X and Y'X in one distributed pass
    CompositeJob compositeJob = new CompositeJob();
    compositeJob.computeYtXandXtX(distY, distY2X, ym, xm, getTempPath(), conf, "" + round);
    Matrix centralXtX = compositeJob.xtx;
    Matrix centralYtX = compositeJob.ytx;
    if (sampleRate < 1) { // rescale
      centralXtX.assign(Functions.div(sampleRate));
      centralYtX.assign(Functions.div(sampleRate));
    }
    // XtX = X'*X + ss * Sx  (element-wise, via the assign combinator)
    final double finalss = ss; // must be effectively final for the anonymous class
    centralXtX.assign(centralSx, new DoubleDoubleFunction() {
      @Override
      public double apply(double arg1, double arg2) {
        return arg1 + finalss * arg2;
      }
    });
    // M-step: C = (Ye'*X) / SumXtX;
    Matrix invXtX_central = inv(centralXtX);
    centralC = centralYtX.times(invXtX_central);
    distC = PCACommon.toDistributedRowMatrix(centralC, getTempPath(), getTempPath(), "C" + round);
    centralCtC = centralC.transpose().times(centralC);
    // Compute new value for ss
    // ss = ( sum(sum(Ye.^2)) + PCACommon.trace(XtX*CtC) - 2sum(XiCtYit) ) /(N*D);
    double ss2 = PCACommon.trace(centralXtX.times(centralCtC));
    VarianceJob varianceJob = new VarianceJob();
    double xctyt = varianceJob.computeVariance(distY, ym, distY2X, xm, distC, getTempPath(), conf, "" + round);
    if (sampleRate < 1) { // rescale
      xctyt = xctyt / sampleRate;
    }
    ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);
    // Trace diagnostics for debugging convergence
    double traceSx = PCACommon.trace(centralSx);
    double traceXtX = PCACommon.trace(centralXtX);
    double traceC = PCACommon.trace(centralC);
    double traceCtC = PCACommon.trace(centralCtC);
    log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceXtX + " " + traceC + " " + traceCtC);
    double objective = ss;
    relChangeInObjective = Math.abs(1 - objective / prevObjective);
    prevObjective = objective;
    // NOTE(review): printf-style format passed to a logger — if `log` is SLF4J the
    // %.6f placeholders will not be substituted; verify against the declared logger.
    log.info("Objective: %.6f relative change: %.6f \n", objective, relChangeInObjective);
    if (!CALCULATE_ERR_ATTHEEND) {
      log.info("Computing the error at round " + round + " ...");
      ReconstructionErrJob errJob = new ReconstructionErrJob();
      error = errJob.reconstructionErr(distY, distY2X, distC, centralC, ym, xm, errSampleRate, conf,
          getTempPath(), "" + round);
      log.info("... end of computing the error at round " + round);
      prevError = error;
    }
  }
  // Optionally compute the (expensive) reconstruction error only once, after the loop.
  if (CALCULATE_ERR_ATTHEEND) {
    log.info("Computing the error at round " + round + " ...");
    ReconstructionErrJob errJob = new ReconstructionErrJob();
    error = errJob.reconstructionErr(distY, distY2X, distC, centralC, ym, xm, errSampleRate, conf,
        getTempPath(), "" + round);
    log.info("... end of computing the error at round " + round);
  }
  // Publish the converged values back to the caller and persist the PCs.
  initVal.C = centralC;
  initVal.ss = ss;
  writeMatrix(initVal.C, output, getTempPath(), "PCs");
  return error;
}
From source file:org.qcri.pca.SPCADriver.java
/**
 * Run PPCA sequentially given the small input Y which fits into memory. This
 * could also be used on sampled data from a distributed matrix.
 *
 * Note: this implementation ignores NaN values by replacing them with 0, and
 * it mean-centers and span-normalizes {@code centralY} IN PLACE.
 *
 * @param conf the configuration (not used in this in-memory path)
 * @param centralY the input matrix; mutated by centering/normalization
 * @param initVal the initial values for C and ss; updated in place with the final values
 * @param MAX_ROUNDS maximum number of EM iterations
 * @return the relative reconstruction error (norm1 of error / norm1 of Y)
 * @throws Exception declared for interface symmetry with the MR variant
 */
double runSequential(Configuration conf, Matrix centralY, InitialValues initVal, final int MAX_ROUNDS)
    throws Exception {
  Matrix centralC = initVal.C;
  double ss = initVal.ss;
  final int nRows = centralY.numRows();
  final int nCols = centralY.numCols();
  final int nPCs = centralC.numCols();
  final float threshold = 0.00001f; // relative-change stopping threshold
  log.info("tracec= " + PCACommon.trace(centralC));
  // ignore NaN elements by replacing them with 0
  // NOTE(review): `new Double(v).isNaN()` boxes every element; Double.isNaN(v) would avoid it.
  for (int r = 0; r < nRows; r++)
    for (int c = 0; c < nCols; c++)
      if (new Double(centralY.getQuick(r, c)).isNaN()) {
        centralY.setQuick(r, c, 0);
      }
  // centralize and normalize the input matrix
  Vector mean = centralY.aggregateColumns(new VectorFunction() {
    @Override
    public double apply(Vector v) {
      return v.zSum() / nRows;
    }
  });
  // also normalize the matrix by dividing each element by its column's range (max - min)
  Vector spanVector = new DenseVector(nCols);
  for (int c = 0; c < nCols; c++) {
    Vector col = centralY.viewColumn(c);
    double max = col.maxValue();
    double min = col.minValue();
    double span = max - min;
    spanVector.setQuick(c, span);
  }
  // Center and scale in place; constant columns (span == 0) are divided by 1 to avoid NaN.
  for (int r = 0; r < nRows; r++)
    for (int c = 0; c < nCols; c++)
      centralY.set(r, c, (centralY.get(r, c) - mean.get(c))
          / (spanVector.getQuick(c) != 0 ? spanVector.getQuick(c) : 1));
  Matrix centralCtC = centralC.transpose().times(centralC);
  log.info("tracectc= " + PCACommon.trace(centralCtC));
  log.info("traceinvctc= " + PCACommon.trace(inv(centralCtC)));
  log.info("traceye= " + PCACommon.trace(centralY));
  log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
  int count = 1; // loop sentinel: set to 0 when converged (and past a minimum of iterations)
  // old = Inf;
  double old = Double.MAX_VALUE;
  // -------------------------- EM Iterations
  // while count
  // NOTE(review): if MAX_ROUNDS == 0 the loop never runs and centralX stays null,
  // so the reconstruction below would throw NPE — confirm callers always pass >= 1.
  Matrix centralX = null;
  int round = 0;
  while (round < MAX_ROUNDS && count > 0) {
    round++;
    // E-step: Sx = inv( ss * eye(d) + CtC );
    Matrix Sx = eye(nPCs).times(ss).plus(centralCtC);
    Sx = inv(Sx);
    // X = Ye*C*Sx';
    centralX = centralY.times(centralC).times(Sx.transpose());
    // XtX = X'*X + ss * Sx;
    Matrix centralXtX = centralX.transpose().times(centralX).plus(Sx.times(ss));
    // M-step: C = (Ye'*X) / XtX;
    Matrix tmpInv = inv(centralXtX);
    centralC = centralY.transpose().times(centralX).times(tmpInv);
    // CtC = C'*C;
    centralCtC = centralC.transpose().times(centralC);
    // ss = ( sum(sum( (X*C'-Ye).^2 )) + trace(XtX*CtC) - 2*xcty ) /(N*D);
    // NOTE(review): norm2 only depends on centralY, which is not modified in the loop —
    // it could be hoisted above the loop to avoid a full clone per iteration.
    double norm2 = centralY.clone().assign(new DoubleFunction() {
      @Override
      public double apply(double arg1) {
        return arg1 * arg1;
      }
    }).zSum();
    ss = norm2 + PCACommon.trace(centralXtX.times(centralCtC));
    // ss3 = sum (X(i:0) * C' * Y(i,:)')
    DenseVector resVector = new DenseVector(nCols);
    double xctyt = 0;
    for (int i = 0; i < nRows; i++) {
      PCACommon.vectorTimesMatrixTranspose(centralX.viewRow(i), centralC, resVector);
      double res = resVector.dot(centralY.viewRow(i));
      xctyt += res;
    }
    ss -= 2 * xctyt;
    ss /= (nRows * nCols);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
    // Trace diagnostics for debugging convergence
    double traceSx = PCACommon.trace(Sx);
    double traceX = PCACommon.trace(centralX);
    double traceSumXtX = PCACommon.trace(centralXtX);
    double traceC = PCACommon.trace(centralC);
    double traceCtC = PCACommon.trace(centralCtC);
    log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceX + " " + traceSumXtX + " " + traceC + " "
        + traceCtC + " " + 0);
    double objective = ss;
    double rel_ch = Math.abs(1 - objective / old);
    old = objective;
    count++;
    // Stop only after at least a few rounds AND the objective has stabilized.
    if (rel_ch < threshold && count > 5)
      count = 0;
    // NOTE(review): printf-style format passed to a logger — if `log` is SLF4J the
    // %.6f placeholders will not be substituted; verify against the declared logger.
    log.info("Objective: %.6f relative change: %.6f \n", objective, rel_ch);
  }
  // Report the relative reconstruction error using column-wise L1 norms.
  double norm1Y = centralY.aggregateColumns(new VectorNorm1()).maxValue();
  log.info("Norm1 of Ye is: " + norm1Y);
  Matrix newYerror = centralY.minus(centralX.times(centralC.transpose()));
  double norm1Err = newYerror.aggregateColumns(new VectorNorm1()).maxValue();
  log.info("Norm1 of the reconstruction error is: " + norm1Err);
  initVal.C = centralC;
  initVal.ss = ss;
  return norm1Err / norm1Y;
}
From source file:org.qcri.pca.SPCADriver.java
/**
 * Run PPCA sequentially (Jacob's EM formulation) given the small input Y which
 * fits into memory. This could also be used on sampled data from a distributed
 * matrix.
 *
 * Note: this implementation ignores NaN values by replacing them with 0, it
 * mean-centers and span-normalizes {@code centralY} IN PLACE, and it derives
 * the initial ss from the initial reconstruction instead of using initVal.ss.
 *
 * @param conf the configuration (not used in this in-memory path)
 * @param centralY the input matrix; mutated by centering/normalization
 * @param initVal the initial value for C (initial ss is ignored); updated in
 *        place with the final C and ss
 * @param MAX_ROUNDS maximum number of EM iterations
 * @return the relative reconstruction error (norm1 of error / norm1 of Y)
 */
double runSequential_JacobVersion(Configuration conf, Matrix centralY, InitialValues initVal,
    final int MAX_ROUNDS) {
  Matrix centralC = initVal.C; // the current implementation doesn't use the initial ss of initVal
  final int nRows = centralY.numRows();
  final int nCols = centralY.numCols();
  final int nPCs = centralC.numCols();
  final float threshold = 0.00001f; // relative-change stopping threshold
  log.info("tracec= " + PCACommon.trace(centralC));
  // Y = Y - mean(Ye); also normalize the matrix.
  // First, ignore NaN elements by replacing them with 0.
  // NOTE(review): `new Double(v).isNaN()` boxes every element; Double.isNaN(v) would avoid it.
  for (int r = 0; r < nRows; r++)
    for (int c = 0; c < nCols; c++)
      if (new Double(centralY.getQuick(r, c)).isNaN()) {
        centralY.setQuick(r, c, 0);
      }
  Vector mean = centralY.aggregateColumns(new VectorFunction() {
    @Override
    public double apply(Vector v) {
      return v.zSum() / nRows;
    }
  });
  // Per-column range (max - min), used to scale each column.
  Vector spanVector = new DenseVector(nCols);
  for (int c = 0; c < nCols; c++) {
    Vector col = centralY.viewColumn(c);
    double max = col.maxValue();
    double min = col.minValue();
    double span = max - min;
    spanVector.setQuick(c, span);
  }
  // Center and scale in place; constant columns (span == 0) are divided by 1 to avoid NaN.
  for (int r = 0; r < nRows; r++)
    for (int c = 0; c < nCols; c++)
      centralY.set(r, c, (centralY.get(r, c) - mean.get(c))
          / (spanVector.getQuick(c) != 0 ? spanVector.getQuick(c) : 1));
  // -------------------------- initialization
  // CtC = C'*C;
  Matrix centralCtC = centralC.transpose().times(centralC);
  log.info("tracectc= " + PCACommon.trace(centralCtC));
  log.info("traceinvctc= " + PCACommon.trace(inv(centralCtC)));
  log.info("traceye= " + PCACommon.trace(centralY));
  // X = Ye * C * inv(CtC);
  Matrix centralX = centralY.times(centralC).times(inv(centralCtC));
  log.info("tracex= " + PCACommon.trace(centralX));
  // recon = X * C';
  Matrix recon = centralX.times(centralC.transpose());
  log.info("tracerec= " + PCACommon.trace(recon));
  // ss = sum(sum((recon-Ye).^2)) / (N*D-missing);
  double ss = recon.minus(centralY).assign(new DoubleFunction() {
    @Override
    public double apply(double arg1) {
      return arg1 * arg1;
    }
  }).zSum() / (nRows * nCols);
  log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
  int count = 1; // loop sentinel: set to 0 when converged (and past a minimum of iterations)
  // old = Inf;
  double old = Double.MAX_VALUE;
  // -------------------------- EM Iterations
  // while count
  int round = 0;
  while (round < MAX_ROUNDS && count > 0) {
    round++;
    // ------------------ E-step, (co)variances
    // Sx = inv( eye(d) + CtC/ss );
    Matrix centralSx = eye(nPCs).plus(centralCtC.divide(ss));
    centralSx = inv(centralSx);
    // ------------------ E-step expected value
    // X = Ye*C*(Sx/ss);
    centralX = centralY.times(centralC).times(centralSx.divide(ss));
    // ------------------ M-step
    // SumXtX = X'*X;
    Matrix centralSumXtX = centralX.transpose().times(centralX);
    // C = (Ye'*X) / (SumXtX + N*Sx );
    Matrix tmpInv = inv(centralSumXtX.plus(centralSx.times(nRows)));
    centralC = centralY.transpose().times(centralX).times(tmpInv);
    // CtC = C'*C;
    centralCtC = centralC.transpose().times(centralC);
    // ss = ( sum(sum( (X*C'-Ye).^2 )) + N*sum(sum(CtC.*Sx)) + missing*ss_old ) /(N*D);
    recon = centralX.times(centralC.transpose());
    double error = recon.minus(centralY).assign(new DoubleFunction() {
      @Override
      public double apply(double arg1) {
        return arg1 * arg1;
      }
    }).zSum();
    // dot(...) here is assumed to be the element-wise (Hadamard) product helper,
    // so zSum() gives sum(sum(CtC.*Sx)) — the clone protects centralCtC from mutation.
    ss = error + nRows * dot(centralCtC.clone(), centralSx).zSum();
    ss /= (nRows * nCols);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
    // Trace diagnostics for debugging convergence
    double traceSx = PCACommon.trace(centralSx);
    double traceX = PCACommon.trace(centralX);
    double traceSumXtX = PCACommon.trace(centralSumXtX);
    double traceC = PCACommon.trace(centralC);
    double traceCtC = PCACommon.trace(centralCtC);
    log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceX + " " + traceSumXtX + " " + traceC + " "
        + traceCtC + " " + 0);
    // objective = N*D + N*(D*log(ss) +PCACommon.trace(Sx)-log(det(Sx)) )
    // +PCACommon.trace(SumXtX) -missing*log(ss_old);
    double objective = nRows * nCols
        + nRows * (nCols * Math.log(ss) + PCACommon.trace(centralSx) - Math.log(centralSx.determinant()))
        + PCACommon.trace(centralSumXtX);
    double rel_ch = Math.abs(1 - objective / old);
    old = objective;
    count++;
    // Stop only after at least a few rounds AND the objective has stabilized.
    if (rel_ch < threshold && count > 5)
      count = 0;
    System.out.printf("Objective: %.6f relative change: %.6f \n", objective, rel_ch);
  }
  // Report the relative reconstruction error using column-wise L1 norms.
  double norm1Y = centralY.aggregateColumns(new VectorNorm1()).maxValue();
  log.info("Norm1 of Y is: " + norm1Y);
  Matrix newYerror = centralY.minus(centralX.times(centralC.transpose()));
  double norm1Err = newYerror.aggregateColumns(new VectorNorm1()).maxValue();
  log.info("Norm1 of the reconstruction error is: " + norm1Err);
  initVal.C = centralC;
  initVal.ss = ss;
  return norm1Err / norm1Y;
}
From source file:org.qcri.sparkpca.SparkPCA.java
/**
 * Compute principal component analysis where the input is an
 * RDD&lt;org.apache.spark.mllib.linalg.Vector&gt; of vectors such that each vector
 * represents a row in the matrix.
 *
 * @param sc Spark context that contains the configuration parameters and
 *        represents the connection to the cluster (used to create RDDs,
 *        accumulators and broadcast variables on that cluster)
 * @param vectors an RDD of vectors representing the rows of the input matrix;
 *        each row is cast to SparseVector, so the input must be sparse
 * @param outputPath directory used for the optional projected-matrix output
 * @param nRows number of rows in the input matrix
 * @param nCols number of columns in the input matrix
 * @param nPCs number of desired principal components
 * @param errRate the sampling rate that is used for computing the reconstruction error
 * @param maxIterations maximum number of iterations before terminating
 * @param computeProjectedMatrix when 1, also write Y*C to outputPath
 * @return matrix of size nCols x nPCs having the desired principal components
 */
public static org.apache.spark.mllib.linalg.Matrix computePrincipalComponents(JavaSparkContext sc,
    JavaRDD<org.apache.spark.mllib.linalg.Vector> vectors, String outputPath, final int nRows, final int nCols,
    final int nPCs, final double errRate, final int maxIterations, final int computeProjectedMatrix) {
  System.out.println("Rows: " + nRows + ", Columns " + nCols);
  double ss;
  Matrix centralC;
  // The two PPCA variables that improve over each iteration: principal component
  // matrix (C) and random variance (ss). Initialized randomly for the first iteration.
  ss = PCAUtils.randSS();
  centralC = PCAUtils.randomMatrix(nCols, nPCs);
  // 1. Mean Job: calculates the column sums of the input RDD via an accumulator.
  final Accumulator<double[]> matrixAccumY = sc.accumulator(new double[nCols], new VectorAccumulatorParam());
  // NOTE(review): internalSumY is captured from the driver closure; correctness relies
  // on Spark deserializing a fresh copy per task — verify behavior in local mode.
  final double[] internalSumY = new double[nCols];
  vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {
    public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
      org.apache.spark.mllib.linalg.Vector yi;
      int[] indices = null;
      int i;
      while (arg0.hasNext()) {
        yi = arg0.next();
        indices = ((SparseVector) yi).indices();
        for (i = 0; i < indices.length; i++) {
          internalSumY[indices[i]] += yi.apply(indices[i]);
        }
      }
      matrixAccumY.add(internalSumY);
    }
  }); // End Mean Job
  // Get the sum-of-columns vector from the accumulator and divide each element
  // by the number of rows to get the mean.
  final Vector meanVector = new DenseVector(matrixAccumY.value()).divide(nRows);
  final Broadcast<Vector> br_ym_mahout = sc.broadcast(meanVector);
  // 2. Frobenious Norm Job: obtain the Frobenius norm of the mean-centered input.
  /*
   * To compute the norm2 of a sparse matrix, iterate over sparse items and sum
   * square of the difference. After processing each row, add the sum of the
   * meanSquare of the zero-elements that were ignored in the sparse iteration.
   */
  double meanSquareSumTmp = 0;
  for (int i = 0; i < br_ym_mahout.value().size(); i++) {
    double element = br_ym_mahout.value().getQuick(i);
    meanSquareSumTmp += element * element;
  }
  final double meanSquareSum = meanSquareSumTmp;
  final Accumulator<Double> doubleAccumNorm2 = sc.accumulator(0.0);
  vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
    public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
      double norm2 = 0;
      double meanSquareSumOfZeroElements = meanSquareSum;
      int[] indices = ((SparseVector) yi).indices();
      int i;
      int index;
      double v;
      // iterate over non-zero elements of this row
      for (i = 0; i < indices.length; i++) {
        index = indices[i];
        v = yi.apply(index);
        double mean = br_ym_mahout.value().getQuick(index);
        double diff = v - mean;
        diff *= diff;
        // cancel the effect of the non-zero element in meanSquareSum
        meanSquareSumOfZeroElements -= mean * mean;
        norm2 += diff;
      }
      // For all zero items, the following adds the remaining sum of mean squares.
      norm2 += meanSquareSumOfZeroElements;
      doubleAccumNorm2.add(norm2);
    }
  }); // end Frobenious Norm Job
  double norm2 = doubleAccumNorm2.value();
  log.info("NOOOOORM2=" + norm2);
  // initial CtC
  Matrix centralCtC = centralC.transpose().times(centralC);
  Matrix centralY2X = null;
  // -------------------------- EM Iterations
  // while count
  int round = 0;
  double prevObjective = Double.MAX_VALUE;
  double error = 0;
  double relChangeInObjective = Double.MAX_VALUE;
  double prevError = Double.MAX_VALUE;
  final float threshold = 0.00001f; // relative-change-in-objective stopping threshold
  final int firstRound = round; // NOTE(review): never read in this method — dead local
  for (; (round < maxIterations && relChangeInObjective > threshold && prevError > 0.02); round++) {
    // Sx = inv( ss * eye(d) + CtC );
    Matrix centralSx = centralCtC.clone();
    // Sx = inv( eye(d) + CtC/ss );
    centralSx.viewDiagonal().assign(Functions.plus(ss));
    centralSx = PCAUtils.inv(centralSx);
    // X = Y * C * Sx' => Y2X = C * Sx'
    centralY2X = centralC.times(centralSx.transpose());
    // Broadcast Y2X because it will be used in several jobs and several iterations.
    final Broadcast<Matrix> br_centralY2X = sc.broadcast(centralY2X);
    // Xm = Ym * Y2X
    Vector xm_mahout = new DenseVector(nPCs);
    xm_mahout = PCAUtils.denseVectorTimesMatrix(br_ym_mahout.value(), centralY2X, xm_mahout);
    // Broadcast Xm because it will be used in several iterations.
    final Broadcast<Vector> br_xm_mahout = sc.broadcast(xm_mahout);
    // We skip computing X as we generate it on demand using Y and Y2X
    // 3. X'X and Y'X Job: the job computes the two matrices X'X and Y'X.
    /*
     * Xc = Yc * MEM (MEM is the in-memory broadcasted matrix Y2X)
     *
     * XtX = Xc' * Xc
     *
     * YtX = Yc' * Xc
     *
     * It also considers that Y is sparse and receives the mean vectors Ym and Xm
     * separately.
     *
     * Yc = Y - Ym
     *
     * Xc = X - Xm
     *
     * Xc = (Y - Ym) * MEM = Y * MEM - Ym * MEM = X - Xm
     *
     * XtX = (X - Xm)' * (X - Xm)
     *
     * YtX = (Y - Ym)' * (X - Xm)
     */
    final Accumulator<double[][]> matrixAccumXtx = sc.accumulator(new double[nPCs][nPCs],
        new MatrixAccumulatorParam());
    final Accumulator<double[][]> matrixAccumYtx = sc.accumulator(new double[nCols][nPCs],
        new MatrixAccumulatorParam());
    final Accumulator<double[]> matrixAccumX = sc.accumulator(new double[nPCs], new VectorAccumulatorParam());
    /*
     * Initialize the output matrices and vectors once in order to avoid generating
     * massive intermediate data in the workers.
     */
    final double[][] resArrayYtX = new double[nCols][nPCs];
    final double[][] resArrayXtX = new double[nPCs][nPCs];
    final double[] resArrayX = new double[nPCs];
    /*
     * Used to sum the vectors in one partition.
     */
    final double[][] internalSumYtX = new double[nCols][nPCs];
    final double[][] internalSumXtX = new double[nPCs][nPCs];
    final double[] internalSumX = new double[nPCs];
    vectors.foreachPartition(new VoidFunction<Iterator<org.apache.spark.mllib.linalg.Vector>>() {
      public void call(Iterator<org.apache.spark.mllib.linalg.Vector> arg0) throws Exception {
        org.apache.spark.mllib.linalg.Vector yi;
        while (arg0.hasNext()) {
          yi = arg0.next();
          /*
           * Perform in-memory matrix multiplication xi = yi' * Y2X
           */
          PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
          // get only the sparse indices
          int[] indices = ((SparseVector) yi).indices();
          PCAUtils.outerProductWithIndices(yi, br_ym_mahout.value(), resArrayX, br_xm_mahout.value(),
              resArrayYtX, indices);
          PCAUtils.outerProductArrayInput(resArrayX, br_xm_mahout.value(), resArrayX, br_xm_mahout.value(),
              resArrayXtX);
          int i, j, rowIndexYtX;
          // add the sparse indices only
          for (i = 0; i < indices.length; i++) {
            rowIndexYtX = indices[i];
            for (j = 0; j < nPCs; j++) {
              internalSumYtX[rowIndexYtX][j] += resArrayYtX[rowIndexYtX][j];
              resArrayYtX[rowIndexYtX][j] = 0; // reset it
            }
          }
          for (i = 0; i < nPCs; i++) {
            internalSumX[i] += resArrayX[i];
            for (j = 0; j < nPCs; j++) {
              internalSumXtX[i][j] += resArrayXtX[i][j];
              resArrayXtX[i][j] = 0; // reset it
            }
          }
        }
        matrixAccumX.add(internalSumX);
        matrixAccumXtx.add(internalSumXtX);
        matrixAccumYtx.add(internalSumYtX);
      }
    }); // end X'X and Y'X Job
    /*
     * Get the values of the accumulators.
     */
    Matrix centralYtX = new DenseMatrix(matrixAccumYtx.value());
    Matrix centralXtX = new DenseMatrix(matrixAccumXtx.value());
    Vector centralSumX = new DenseVector(matrixAccumX.value());
    /*
     * Mi = (Yi-Ym)' x (Xi-Xm) = Yi' x (Xi-Xm) - Ym' x (Xi-Xm)
     *
     * M = Sum(Mi) = Sum(Yi' x (Xi-Xm)) - Ym' x (Sum(Xi)-N*Xm)
     *
     * The first part is done in the previous job and the second in the following method.
     */
    centralYtX = PCAUtils.updateXtXAndYtx(centralYtX, centralSumX, br_ym_mahout.value(), xm_mahout, nRows);
    centralXtX = PCAUtils.updateXtXAndYtx(centralXtX, centralSumX, xm_mahout, xm_mahout, nRows);
    // XtX = X'*X + ss * Sx
    final double finalss = ss; // must be effectively final for the anonymous class
    centralXtX.assign(centralSx, new DoubleDoubleFunction() {
      @Override
      public double apply(double arg1, double arg2) {
        return arg1 + finalss * arg2;
      }
    });
    // C = (Ye'*X) / SumXtX;
    Matrix invXtX_central = PCAUtils.inv(centralXtX);
    centralC = centralYtX.times(invXtX_central);
    centralCtC = centralC.transpose().times(centralC);
    // Compute new value for ss
    // ss = ( sum(sum(Ye.^2)) + trace(XtX*CtC) - 2sum(XiCtYit))/(N*D);
    // 4. Variance Job: computes the part of the variance that requires a distributed job.
    /*
     * xcty = Sum (xi * C' * yi')
     *
     * We also regenerate xi on demand by the following formula:
     *
     * xi = yi * y2x
     *
     * To make it efficient for sparse matrices that are not mean-centered, we receive
     * the mean separately:
     *
     * xi = (yi - ym) * y2x = yi * y2x - xm, where xm = ym*y2x
     *
     * xi * C' * (yi-ym)' = xi * ((yi-ym)*C)' = xi * (yi*C - ym*C)'
     */
    double ss2 = PCAUtils.trace(centralXtX.times(centralCtC));
    final double[] resArrayYmC = new double[centralC.numCols()];
    PCAUtils.denseVectorTimesMatrix(meanVector, centralC, resArrayYmC);
    final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
    final double[] resArrayYiC = new double[centralC.numCols()];
    final Accumulator<Double> doubleAccumXctyt = sc.accumulator(0.0);
    vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
      public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
        PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
        PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value(), resArrayYiC);
        PCAUtils.subtract(resArrayYiC, resArrayYmC);
        double dotRes = PCAUtils.dot(resArrayX, resArrayYiC);
        doubleAccumXctyt.add(dotRes);
      }
    }); // end Variance Job
    double xctyt = doubleAccumXctyt.value();
    ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);
    double objective = ss;
    relChangeInObjective = Math.abs(1 - objective / prevObjective);
    prevObjective = objective;
    // NOTE(review): printf-style format passed to a logger — if `log` is SLF4J the
    // %.6f placeholders will not be substituted; verify against the declared logger.
    log.info("Objective: %.6f relative change: %.6f \n", objective, relChangeInObjective);
    if (!CALCULATE_ERR_ATTHEEND) {
      log.info("Computing the error at round " + round + " ...");
      System.out.println("Computing the error at round " + round + " ...");
      double[] resArrayZmTmp = new double[centralC.numRows()];
      PCAUtils.vectorTimesMatrixTranspose(xm_mahout, centralC, resArrayZmTmp);
      final double[] resArrayZm = PCAUtils.subtractVectorArray(resArrayZmTmp, meanVector);
      // 5. Reconstruction Error Job: computes the reconstruction error of the input
      // matrix after updating the principal components matrix.
      /*
       * Xc = Yc * Y2X
       *
       * ReconY = Xc * C'
       *
       * Err = ReconY - Yc
       *
       * Norm2(Err) = abs(Err).zSum().max()
       *
       * To take the sparse matrices into account we receive the mean separately:
       *
       * X = (Y - Ym) * Y2X = X - Xm, where X=Y*Y2X and Xm=Ym*Y2X
       *
       * ReconY = (X - Xm) * C' = X*C' - Xm*C'
       *
       * Err = X*C' - Xm*C' - (Y - Ym) = X*C' - Y - Zm, where Zm=Xm*C'-Ym
       */
      final Accumulator<double[]> vectorAccumErr = sc.accumulator(new double[nCols],
          new VectorAccumulatorAbsParam());
      final Accumulator<double[]> vectorAccumNormCentralized = sc.accumulator(new double[nCols],
          new VectorAccumulatorAbsParam());
      final double[] xiCtArray = new double[centralC.numRows()];
      final double[] normalizedArray = new double[nCols];
      vectors.foreach(new VoidFunction<org.apache.spark.mllib.linalg.Vector>() {
        public void call(org.apache.spark.mllib.linalg.Vector yi) throws Exception {
          // Sample rows: skip this row with probability controlled by errRate.
          if (PCAUtils.pass(errRate))
            return;
          PCAUtils.sparseVectorTimesMatrix(yi, br_centralY2X.value(), resArrayX);
          PCAUtils.vectorTimesMatrixTranspose(resArrayX, br_centralC.value(), xiCtArray);
          PCAUtils.denseVectorSubtractSparseSubtractDense(xiCtArray, yi, resArrayZm);
          vectorAccumErr.add(xiCtArray);
          PCAUtils.denseVectorMinusVector(yi, br_ym_mahout.value(), normalizedArray);
          vectorAccumNormCentralized.add(normalizedArray);
        }
      }); // end Reconstruction Error Job
      /*
       * The norm of the reconstruction error matrix.
       */
      double reconstructionError = PCAUtils.getMax(vectorAccumErr.value());
      log.info("************************ReconsructionError=" + reconstructionError);
      /*
       * The norm of the input matrix after centralization.
       */
      double centralizedYNorm = PCAUtils.getMax(vectorAccumNormCentralized.value());
      log.info("************************CentralizedNOrm=" + centralizedYNorm);
      error = reconstructionError / centralizedYNorm;
      log.info("... end of computing the error at round " + round + " And error=" + error);
      prevError = error;
    }
  }
  if (computeProjectedMatrix == 1) {
    // 7. Compute Projected Matrix Job: multiplies the input matrix Y with the
    // principal components C to get the projected vectors.
    final Broadcast<Matrix> br_centralC = sc.broadcast(centralC);
    JavaRDD<org.apache.spark.mllib.linalg.Vector> projectedVectors = vectors.map(
        new Function<org.apache.spark.mllib.linalg.Vector, org.apache.spark.mllib.linalg.Vector>() {
          public org.apache.spark.mllib.linalg.Vector call(org.apache.spark.mllib.linalg.Vector yi)
              throws Exception {
            return PCAUtils.sparseVectorTimesMatrix(yi, br_centralC.value());
          }
        }); // End Compute Projected Matrix
    String path = outputPath + File.separator + "ProjectedMatrix";
    projectedVectors.saveAsTextFile(path);
  }
  return PCAUtils.convertMahoutToSparkMatrix(centralC);
}