List of usage examples for org.apache.mahout.math.hadoop.DistributedRowMatrix#getOutputTempPath()
public Path getOutputTempPath()
From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + Sequence2MatrixFormatJob.class.getName()); Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); Sequence2MatrixFormatJob job = new Sequence2MatrixFormatJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), outPath); } else {//from w w w .j a v a 2 s . c o m log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java
License:Apache License
/**
 * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link ABInnerHDFSBroadcastOfB} for further details.
 *
 * The product is written to {@code label} under the temp path of A; when that
 * directory is already present the job is skipped and a warning is logged.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 * @throws CardinalityException if the column count of A differs from the row
 *           count of B
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A,
    DistributedRowMatrix B, String label)
    throws IOException, InterruptedException, ClassNotFoundException {
  log.info("running " + ABInnerHDFSBroadcastOfB.class.getName());

  // The inner dimensions must agree before anything is touched on disk.
  if (A.numCols() != B.numRows()) {
    throw new CardinalityException(A.numCols(), B.numRows());
  }

  final Path tempDir = A.getOutputTempPath();
  final Path productDir = new Path(tempDir, label);
  final FileSystem fileSystem = FileSystem.get(productDir.toUri(), conf);
  final ABInnerHDFSBroadcastOfB multiplyJob = new ABInnerHDFSBroadcastOfB();

  if (fileSystem.exists(productDir)) {
    log.warn("----------- Skip already exists: " + productDir);
  } else {
    multiplyJob.run(conf, A.getRowPath(), B.getRowPath(), productDir, B.numRows(), B.numCols());
  }

  final DistributedRowMatrix product =
      new DistributedRowMatrix(productDir, A.getOutputTempPath(), A.numRows(), B.numCols());
  product.setConf(conf);
  return product;
}
From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java
License:Apache License
/** * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix * object. Refer to {@link ABOuterHDFSBroadcastOfA} for further details. * /* w ww. j av a 2 s. c om*/ * @param conf * the initial configuration * @param A * matrix A * @param B * matrix B * @param label * the label for the output directory * @return AxB wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + ABOuterHDFSBroadcastOfA.class.getName()); if (A.numCols() != B.numRows()) { throw new CardinalityException(A.numCols(), B.numRows()); } Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); ABOuterHDFSBroadcastOfA job = new ABOuterHDFSBroadcastOfA(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), A.numCols()); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + AtBOuterStaticMapsideJoinJob.class.getName()); if (A.numRows() != B.numRows()) { throw new CardinalityException(A.numRows(), B.numRows()); }//from www. j a v a 2 s . c o m Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); AtBOuterStaticMapsideJoinJob job = new AtBOuterStaticMapsideJoinJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), B.getRowPath(), outPath, B.numCols()); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numCols(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/** * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix * object. Refer to {@link AtB_DMJ} for further details. * /*from www . j av a2s .com*/ * Automatically decide on partitioning the larger matrix to be used with * in-memory combiners. * * @param conf the initial configuration * @param At transpose of matrix A * @param B matrix B * @param label the label for the output directory * @param labelAtCol by using a fixed label for AtCol one can avoid the second * run of the partitioning job if we know that At is not changed * @param lableBCol by using a fixed label for BCol one can avoid the second * run of the partitioning job if we know that B is not changed * @return AxB wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix smartRun(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B, String label, String labelAtCol, String lableBCol) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + AtB_DMJ.class.getName()); if (At.numRows() != B.numRows()) throw new CardinalityException(At.numRows(), B.numRows()); Path outPath = new Path(At.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); AtB_DMJ job = new AtB_DMJ(); if (!fs.exists(outPath)) { int numColPartitionsAt = 1, numColPartitionsB = 1; int numColPartitions = NMFCommon.computeOptColPartitionsForMemCombiner(conf, At.numCols(), B.numCols()); long atSize = MapDir.du(At.getRowPath(), fs); long bSize = MapDir.du(B.getRowPath(), fs); //cost is size of remote reads. For each col partition we need to read the entire of the other matrix once long atPartitionCost = numColPartitions * bSize; long bPartitionCost = numColPartitions * atSize; log.info("smart partitioning: numColPartitions: " + numColPartitions + " atSize: " + atSize + " bSize: " + bSize + " atCost=" + atPartitionCost + " vs. 
bCost=" + bPartitionCost); if (atPartitionCost < bPartitionCost) { At = ColPartitionJob.partition(At, conf, labelAtCol, numColPartitions); numColPartitionsAt = numColPartitions; } else { B = ColPartitionJob.partition(B, conf, lableBCol, numColPartitions); numColPartitionsB = numColPartitions; } job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt, numColPartitionsB, true); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/**
 * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link AtB_DMJ} for further details.
 *
 * The product is written to {@code label} under the temp path of At; when that
 * directory is already present the job is skipped and a warning is logged.
 *
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param numColPartitionsAt the column partition count handed to the job for
 *          At; at most one of the two counts may differ from 1
 * @param numColPartitionsB the column partition count handed to the job for B;
 *          at most one of the two counts may differ from 1
 * @param label the label for the output directory
 * @param useInMemCombiner forwarded unchanged to the job
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException if both partition counts differ from 1, or on I/O failure
 * @throws InterruptedException
 * @throws ClassNotFoundException
 * @throws CardinalityException if At and B do not have the same number of rows
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix At,
    DistributedRowMatrix B, int numColPartitionsAt, int numColPartitionsB, String label,
    boolean useInMemCombiner) throws IOException, InterruptedException, ClassNotFoundException {
  log.info("running " + AtB_DMJ.class.getName());

  // Guard clauses: shared dimension first, then the partitioning constraint.
  if (At.numRows() != B.numRows()) {
    throw new CardinalityException(At.numRows(), B.numRows());
  }
  final boolean atIsPartitioned = numColPartitionsAt != 1;
  final boolean bIsPartitioned = numColPartitionsB != 1;
  if (atIsPartitioned && bIsPartitioned) {
    throw new IOException("AtB_DMJ: not both At and B can be column partitioned!");
  }

  final Path productDir = new Path(At.getOutputTempPath(), label);
  final FileSystem fileSystem = FileSystem.get(productDir.toUri(), conf);
  final AtB_DMJ multiplyJob = new AtB_DMJ();

  if (fileSystem.exists(productDir)) {
    log.warn("----------- Skip already exists: " + productDir);
  } else {
    multiplyJob.run(conf, At.getRowPath(), B.getRowPath(), productDir, At.numCols(),
        B.numCols(), numColPartitionsAt, numColPartitionsB, useInMemCombiner);
  }

  final DistributedRowMatrix product =
      new DistributedRowMatrix(productDir, At.getOutputTempPath(), At.numCols(), B.numCols());
  product.setConf(conf);
  return product;
}
From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java
License:Apache License
/**
 * Runs {@link PartitionerJob} over the rows of {@code A} and wraps the output
 * directory in a new DistributedRowMatrix.
 *
 * The output is written to {@code label} under the temp path of A; when that
 * directory is already present the job is skipped and a warning is logged.
 *
 * @param conf the configuration used to resolve the file system, handed to the
 *          job, and set on the returned matrix
 * @param A the input matrix
 * @param partitions forwarded unchanged to the job
 * @param label the label for the output directory
 * @return a matrix backed by the output directory, with the same temp path,
 *         row count and column count as {@code A}
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, int partitions,
    String label) throws IOException, InterruptedException, ClassNotFoundException {
  log.info("running " + PartitionerJob.class.getName());

  final Path resultDir = new Path(A.getOutputTempPath(), label);
  final FileSystem fileSystem = FileSystem.get(resultDir.toUri(), conf);
  final PartitionerJob partitionerJob = new PartitionerJob();

  if (fileSystem.exists(resultDir)) {
    log.warn("----------- Skip already exists: " + resultDir);
  } else {
    partitionerJob.run(conf, A.getRowPath(), resultDir, A.numRows(), partitions);
  }

  final DistributedRowMatrix partitioned =
      new DistributedRowMatrix(resultDir, A.getOutputTempPath(), A.numRows(), A.numCols());
  partitioned.setConf(conf);
  return partitioned;
}
From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + Matrix2TextJob.class.getName()); Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); Matrix2TextJob job = new Matrix2TextJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), outPath); } else {// w ww. j ava 2 s .c om log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.nmf.ColPartitionJob.java
License:Apache License
/** * Partition A by columns. Refer to {@link ColPartitionJob} for further * details./*from w w w .ja v a 2s. c o m*/ * * @param distM input matrix A * @param conf the initial configuration * @param label the label for the output directory * @param numColPartitions the hint for the desired number of column * partitions * @return Partitioned A wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix partition(DistributedRowMatrix distM, Configuration conf, String label, int numColPartitions) throws IOException, InterruptedException, ClassNotFoundException { Path outputPath = new Path(distM.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outputPath.toUri(), conf); if (!fs.exists(outputPath)) { ColPartitionJob job = new ColPartitionJob(); job.run(conf, distM.getRowPath(), outputPath, distM.numRows(), distM.numCols(), numColPartitions); } else { log.warn("----------- Skip already exists: " + outputPath); } DistributedRowMatrix m = new DistributedRowMatrix(outputPath, distM.getOutputTempPath(), distM.numRows(), distM.numCols()); m.setConf(conf); return m; }
From source file:com.twitter.algebra.nmf.CombinerJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + CombinerJob.class.getName()); Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); CombinerJob job = new CombinerJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), outPath, A.numRows()); } else {//from w w w .j av a 2s . co m log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }