List of usage examples for org.apache.mahout.math.hadoop DistributedRowMatrix numCols
int numCols
To view the source code for org.apache.mahout.math.hadoop DistributedRowMatrix numCols, click the Source Link.
From source file:com.twitter.algebra.AlgebraCommon.java
License:Apache License
/*** * If the MapDir matrix is small, we can convert it to an in memory representation * and then run efficient centralized operations * //from w ww. j a v a 2 s. c om * @param origMtx in MapDir format (generated by MatrixOutputFormat) * @return a dense matrix including the data * @throws IOException */ public static DenseMatrix toDenseMatrix(DistributedRowMatrix origMtx) throws IOException { MapDir mapDir = new MapDir(new Configuration(), origMtx.getRowPath()); DenseMatrix mtx = new DenseMatrix(origMtx.numRows(), origMtx.numCols()); Iterator<MatrixSlice> sliceIterator; try { sliceIterator = mapDir.iterateAll(); } catch (Exception e) { log.info(e.toString()); log.info("Input is not in matrix format, trying SequenceFileFormat instead ..."); sliceIterator = origMtx.iterateAll(); } while (sliceIterator.hasNext()) { MatrixSlice slice = sliceIterator.next(); // int r = slice.index(); // for (int c = 0; c < mtx.numCols(); c++) { // mtx.set(r, c, slice.get(c)); // } mtx.viewRow(slice.index()).assign(slice.vector()); } mapDir.close(); return mtx; }
From source file:com.twitter.algebra.AlgebraCommon.java
License:Apache License
/*** * If the MapDir matrix is small, we can convert it to an in memory representation * and then run efficient centralized operations * /*from w ww . j a va 2s .c o m*/ * @param origMtx in MapDir format (generated by MatrixOutputFormat) * @return a dense matrix including the data * @throws IOException */ static SparseMatrix toSparseMatrix(DistributedRowMatrix origMtx) throws IOException { MapDir mapDir = new MapDir(new Configuration(), origMtx.getRowPath()); SparseMatrix mtx = new SparseMatrix(origMtx.numRows(), origMtx.numCols()); Iterator<MatrixSlice> sliceIterator = mapDir.iterateAll(); while (sliceIterator.hasNext()) { MatrixSlice slice = sliceIterator.next(); mtx.viewRow(slice.index()).assign(slice.vector()); } mapDir.close(); return mtx; }
From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + Sequence2MatrixFormatJob.class.getName()); Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); Sequence2MatrixFormatJob job = new Sequence2MatrixFormatJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), outPath); } else {//ww w . jav a 2 s . c o m log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java
License:Apache License
/**
 * Multiplies A by B, both given as DistributedRowMatrix objects. See
 * {@link ABInnerHDFSBroadcastOfB} for further details.
 *
 * <p>The job is skipped (with a warning) when the output directory already exists.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B; its row count must equal the column count of A
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A,
        DistributedRowMatrix B, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABInnerHDFSBroadcastOfB.class.getName());

    // The inner dimensions must agree for the product to be defined.
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }

    Path resultPath = new Path(A.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(resultPath.toUri(), conf);
    ABInnerHDFSBroadcastOfB multiplyJob = new ABInnerHDFSBroadcastOfB();

    if (fileSystem.exists(resultPath)) {
        log.warn("----------- Skip already exists: " + resultPath);
    } else {
        multiplyJob.run(conf, A.getRowPath(), B.getRowPath(), resultPath, B.numRows(), B.numCols());
    }

    DistributedRowMatrix product =
            new DistributedRowMatrix(resultPath, A.getOutputTempPath(), A.numRows(), B.numCols());
    product.setConf(conf);
    return product;
}
From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java
License:Apache License
/** * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix * object. Refer to {@link ABOuterHDFSBroadcastOfA} for further details. * /* w ww . j a v a 2 s . c om*/ * @param conf * the initial configuration * @param A * matrix A * @param B * matrix B * @param label * the label for the output directory * @return AxB wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + ABOuterHDFSBroadcastOfA.class.getName()); if (A.numCols() != B.numRows()) { throw new CardinalityException(A.numCols(), B.numRows()); } Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); ABOuterHDFSBroadcastOfA job = new ABOuterHDFSBroadcastOfA(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), A.numCols()); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + AtBOuterStaticMapsideJoinJob.class.getName()); if (A.numRows() != B.numRows()) { throw new CardinalityException(A.numRows(), B.numRows()); }// w w w. j ava2 s. co m Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); AtBOuterStaticMapsideJoinJob job = new AtBOuterStaticMapsideJoinJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), B.getRowPath(), outPath, B.numCols()); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numCols(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/** * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix * object. Refer to {@link AtB_DMJ} for further details. * /*ww w .ja va 2 s. co m*/ * Automatically decide on partitioning the larger matrix to be used with * in-memory combiners. * * @param conf the initial configuration * @param At transpose of matrix A * @param B matrix B * @param label the label for the output directory * @param labelAtCol by using a fixed label for AtCol one can avoid the second * run of the partitioning job if we know that At is not changed * @param lableBCol by using a fixed label for BCol one can avoid the second * run of the partitioning job if we know that B is not changed * @return AxB wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix smartRun(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B, String label, String labelAtCol, String lableBCol) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + AtB_DMJ.class.getName()); if (At.numRows() != B.numRows()) throw new CardinalityException(At.numRows(), B.numRows()); Path outPath = new Path(At.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); AtB_DMJ job = new AtB_DMJ(); if (!fs.exists(outPath)) { int numColPartitionsAt = 1, numColPartitionsB = 1; int numColPartitions = NMFCommon.computeOptColPartitionsForMemCombiner(conf, At.numCols(), B.numCols()); long atSize = MapDir.du(At.getRowPath(), fs); long bSize = MapDir.du(B.getRowPath(), fs); //cost is size of remote reads. For each col partition we need to read the entire of the other matrix once long atPartitionCost = numColPartitions * bSize; long bPartitionCost = numColPartitions * atSize; log.info("smart partitioning: numColPartitions: " + numColPartitions + " atSize: " + atSize + " bSize: " + bSize + " atCost=" + atPartitionCost + " vs. 
bCost=" + bPartitionCost); if (atPartitionCost < bPartitionCost) { At = ColPartitionJob.partition(At, conf, labelAtCol, numColPartitions); numColPartitionsAt = numColPartitions; } else { B = ColPartitionJob.partition(B, conf, lableBCol, numColPartitions); numColPartitionsB = numColPartitions; } job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt, numColPartitionsB, true); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/**
 * Multiplies A by B, where At (the transpose of A) and B are given as DistributedRowMatrix
 * objects. See {@link AtB_DMJ} for further details.
 *
 * <p>At most one of the two operands may be column partitioned, i.e. at least one of the two
 * partition counts must be 1. The job is skipped (with a warning) when the output directory
 * already exists.
 *
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B; it must have the same number of rows as At
 * @param numColPartitionsAt number of column partitions of At, passed through to the job
 * @param numColPartitionsB number of column partitions of B, passed through to the job
 * @param label the label for the output directory
 * @param useInMemCombiner passed through to the job
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException if both partition counts differ from 1
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix At,
        DistributedRowMatrix B, int numColPartitionsAt, int numColPartitionsB, String label,
        boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());

    if (At.numRows() != B.numRows()) {
        throw new CardinalityException(At.numRows(), B.numRows());
    }
    if (numColPartitionsAt != 1 && numColPartitionsB != 1) {
        throw new IOException("AtB_DMJ: not both At and B can be column partitioned!");
    }

    Path resultPath = new Path(At.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(resultPath.toUri(), conf);
    AtB_DMJ multiplyJob = new AtB_DMJ();

    if (fileSystem.exists(resultPath)) {
        log.warn("----------- Skip already exists: " + resultPath);
    } else {
        multiplyJob.run(conf, At.getRowPath(), B.getRowPath(), resultPath, At.numCols(),
                B.numCols(), numColPartitionsAt, numColPartitionsB, useInMemCombiner);
    }

    DistributedRowMatrix product =
            new DistributedRowMatrix(resultPath, At.getOutputTempPath(), At.numCols(), B.numCols());
    product.setConf(conf);
    return product;
}
From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java
License:Apache License
/**
 * Runs {@link PartitionerJob} over the rows of {@code A}, writing to a directory named
 * {@code label} under A's output temp path. The job is skipped (with a warning) when that
 * directory already exists.
 *
 * @param conf the configuration used to resolve the file system and set on the result
 * @param A the input matrix
 * @param partitions the partition count passed through to the job
 * @param label the label for the output directory
 * @return a DistributedRowMatrix over the output path with the same dimensions as {@code A}
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, int partitions,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + PartitionerJob.class.getName());

    Path resultPath = new Path(A.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(resultPath.toUri(), conf);
    PartitionerJob partitionerJob = new PartitionerJob();

    if (fileSystem.exists(resultPath)) {
        log.warn("----------- Skip already exists: " + resultPath);
    } else {
        partitionerJob.run(conf, A.getRowPath(), resultPath, A.numRows(), partitions);
    }

    DistributedRowMatrix partitioned =
            new DistributedRowMatrix(resultPath, A.getOutputTempPath(), A.numRows(), A.numCols());
    partitioned.setConf(conf);
    return partitioned;
}
From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + Matrix2TextJob.class.getName()); Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); Matrix2TextJob job = new Matrix2TextJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), outPath); } else {// w ww. ja v a 2 s.c om log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }