Example usage for org.apache.mahout.math.hadoop DistributedRowMatrix numCols

List of usage examples for org.apache.mahout.math.hadoop DistributedRowMatrix numCols

Introduction

On this page you can find example usages of org.apache.mahout.math.hadoop DistributedRowMatrix numCols.

Prototype

int numCols

To view the source code for org.apache.mahout.math.hadoop DistributedRowMatrix numCols, click Source Link.

Usage

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * If the MapDir matrix is small, we can convert it to an in-memory representation
 * and then run efficient centralized operations.
 *
 * <p>Rows are read through a {@code MapDir} opened on the matrix's row path. If
 * obtaining that iterator fails, the failure is logged and the rows are read through
 * {@code origMtx.iterateAll()} instead. The {@code MapDir} is closed on every exit
 * path, including when copying a row throws.
 *
 * @param origMtx in MapDir format (generated by MatrixOutputFormat)
 * @return a dense matrix including the data
 * @throws IOException propagated from the MapDir or matrix access
 */
public static DenseMatrix toDenseMatrix(DistributedRowMatrix origMtx) throws IOException {
    MapDir mapDir = new MapDir(new Configuration(), origMtx.getRowPath());
    try {
        DenseMatrix mtx = new DenseMatrix(origMtx.numRows(), origMtx.numCols());
        Iterator<MatrixSlice> sliceIterator;
        try {
            sliceIterator = mapDir.iterateAll();
        } catch (Exception e) {
            // Deliberate best-effort fallback: the input may not be in MapDir format.
            log.info(e.toString());
            log.info("Input is not in matrix format, trying SequenceFileFormat instead ...");
            sliceIterator = origMtx.iterateAll();
        }
        while (sliceIterator.hasNext()) {
            MatrixSlice slice = sliceIterator.next();
            // Copy the whole row at once instead of element by element.
            mtx.viewRow(slice.index()).assign(slice.vector());
        }
        return mtx;
    } finally {
        // Release the MapDir even if reading or assigning a row failed.
        mapDir.close();
    }
}

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/***
 * If the MapDir matrix is small, we can convert it to an in memory representation
 * and then run efficient centralized operations
 * /*from   w ww  .  j  a  va 2s  .c o m*/
 * @param origMtx in MapDir format (generated by MatrixOutputFormat)
 * @return a dense matrix including the data
 * @throws IOException 
 */
static SparseMatrix toSparseMatrix(DistributedRowMatrix origMtx) throws IOException {
    MapDir mapDir = new MapDir(new Configuration(), origMtx.getRowPath());
    SparseMatrix mtx = new SparseMatrix(origMtx.numRows(), origMtx.numCols());
    Iterator<MatrixSlice> sliceIterator = mapDir.iterateAll();
    while (sliceIterator.hasNext()) {
        MatrixSlice slice = sliceIterator.next();
        mtx.viewRow(slice.index()).assign(slice.vector());
    }
    mapDir.close();
    return mtx;
}

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

/**
 * Runs {@code Sequence2MatrixFormatJob} on the rows of {@code A}, writing to the
 * directory {@code label} under A's output temp path. The job is skipped (with a
 * warning) when that directory already exists.
 *
 * @param conf the configuration used for the file system lookup and the job
 * @param A the input matrix
 * @param label the name of the output directory under A's output temp path
 * @return a DistributedRowMatrix over the output directory with A's dimensions
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + Sequence2MatrixFormatJob.class.getName());

    final Path target = new Path(A.getOutputTempPath(), label);
    final FileSystem targetFs = FileSystem.get(target.toUri(), conf);
    final Sequence2MatrixFormatJob converter = new Sequence2MatrixFormatJob();

    if (targetFs.exists(target)) {
        log.warn("----------- Skip already exists: " + target);
    } else {
        converter.run(conf, A.getRowPath(), target);
    }

    final DistributedRowMatrix wrapped =
            new DistributedRowMatrix(target, A.getOutputTempPath(), A.numRows(), A.numCols());
    wrapped.setConf(conf);
    return wrapped;
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link ABInnerHDFSBroadcastOfB} for further details.
 *
 * <p>The job is skipped (with a warning) when the output directory already exists.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABInnerHDFSBroadcastOfB.class.getName());

    // The inner dimensions must agree for the product to be defined.
    final boolean innerDimsMatch = A.numCols() == B.numRows();
    if (!innerDimsMatch) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }

    final Path target = new Path(A.getOutputTempPath(), label);
    final FileSystem targetFs = FileSystem.get(target.toUri(), conf);
    final ABInnerHDFSBroadcastOfB multiplier = new ABInnerHDFSBroadcastOfB();

    if (targetFs.exists(target)) {
        log.warn("----------- Skip already exists: " + target);
    } else {
        multiplier.run(conf, A.getRowPath(), B.getRowPath(), target, B.numRows(), B.numCols());
    }

    final DistributedRowMatrix product =
            new DistributedRowMatrix(target, A.getOutputTempPath(), A.numRows(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link ABOuterHDFSBroadcastOfA} for further details.
 * /*  w  ww  . j  a v a  2 s  .  c om*/
 * @param conf
 *          the initial configuration
 * @param A
 *          matrix A
 * @param B
 *          matrix B
 * @param label
 *          the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABOuterHDFSBroadcastOfA.class.getName());
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }
    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    ABOuterHDFSBroadcastOfA job = new ABOuterHDFSBroadcastOfA();
    if (!fs.exists(outPath)) {
        job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), A.numCols());
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

/**
 * Runs {@code AtBOuterStaticMapsideJoinJob} on the rows of {@code A} and {@code B},
 * writing to the directory {@code label} under A's output temp path. The job is
 * skipped (with a warning) when that directory already exists.
 *
 * @param conf the configuration used for the file system lookup and the job
 * @param A the first input matrix; must have as many rows as {@code B}
 * @param B the second input matrix
 * @param label the name of the output directory under A's output temp path
 * @return a DistributedRowMatrix over the output directory, declared with
 *         {@code A.numCols()} rows and {@code B.numCols()} columns
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtBOuterStaticMapsideJoinJob.class.getName());

    final boolean rowCountsMatch = A.numRows() == B.numRows();
    if (!rowCountsMatch) {
        throw new CardinalityException(A.numRows(), B.numRows());
    }

    final Path target = new Path(A.getOutputTempPath(), label);
    final FileSystem targetFs = FileSystem.get(target.toUri(), conf);
    final AtBOuterStaticMapsideJoinJob joiner = new AtBOuterStaticMapsideJoinJob();

    if (targetFs.exists(target)) {
        log.warn("----------- Skip already exists: " + target);
    } else {
        joiner.run(conf, A.getRowPath(), B.getRowPath(), target, B.numCols());
    }

    final DistributedRowMatrix product =
            new DistributedRowMatrix(target, A.getOutputTempPath(), A.numCols(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link AtB_DMJ} for further details.
 * /*ww w .ja va  2 s. co  m*/
 * Automatically decide on partitioning the larger matrix to be used with
 * in-memory combiners.
 * 
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @param labelAtCol by using a fixed label for AtCol one can avoid the second
 *          run of the partitioning job if we know that At is not changed
 * @param lableBCol by using a fixed label for BCol one can avoid the second
 *          run of the partitioning job if we know that B is not changed
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix smartRun(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        String label, String labelAtCol, String lableBCol)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());
    if (At.numRows() != B.numRows())
        throw new CardinalityException(At.numRows(), B.numRows());
    Path outPath = new Path(At.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    AtB_DMJ job = new AtB_DMJ();
    if (!fs.exists(outPath)) {
        int numColPartitionsAt = 1, numColPartitionsB = 1;
        int numColPartitions = NMFCommon.computeOptColPartitionsForMemCombiner(conf, At.numCols(), B.numCols());
        long atSize = MapDir.du(At.getRowPath(), fs);
        long bSize = MapDir.du(B.getRowPath(), fs);
        //cost is size of remote reads. For each col partition we need to read the entire of the other matrix once
        long atPartitionCost = numColPartitions * bSize;
        long bPartitionCost = numColPartitions * atSize;
        log.info("smart partitioning: numColPartitions: " + numColPartitions + " atSize: " + atSize + " bSize: "
                + bSize + " atCost=" + atPartitionCost + " vs.  bCost=" + bPartitionCost);
        if (atPartitionCost < bPartitionCost) {
            At = ColPartitionJob.partition(At, conf, labelAtCol, numColPartitions);
            numColPartitionsAt = numColPartitions;
        } else {
            B = ColPartitionJob.partition(B, conf, lableBCol, numColPartitions);
            numColPartitionsB = numColPartitions;
        }
        job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt,
                numColPartitionsB, true);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link AtB_DMJ} for further details.
 *
 * <p>At most one of the two matrices may be column partitioned, i.e. at least one of
 * the partition counts must be 1. The job is skipped (with a warning) when the
 * output directory already exists.
 *
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param numColPartitionsAt partition count passed to the job for At
 * @param numColPartitionsB partition count passed to the job for B
 * @param label the label for the output directory
 * @param useInMemCombiner flag passed through to the job
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        int numColPartitionsAt, int numColPartitionsB, String label, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());

    final boolean rowCountsMatch = At.numRows() == B.numRows();
    if (!rowCountsMatch) {
        throw new CardinalityException(At.numRows(), B.numRows());
    }
    final boolean atMostOnePartitioned = numColPartitionsAt == 1 || numColPartitionsB == 1;
    if (!atMostOnePartitioned) {
        throw new IOException("AtB_DMJ: not both At and B can be column partitioned!");
    }

    final Path target = new Path(At.getOutputTempPath(), label);
    final FileSystem targetFs = FileSystem.get(target.toUri(), conf);
    final AtB_DMJ multiplier = new AtB_DMJ();

    if (targetFs.exists(target)) {
        log.warn("----------- Skip already exists: " + target);
    } else {
        multiplier.run(conf, At.getRowPath(), B.getRowPath(), target, At.numCols(), B.numCols(),
                numColPartitionsAt, numColPartitionsB, useInMemCombiner);
    }

    final DistributedRowMatrix product =
            new DistributedRowMatrix(target, At.getOutputTempPath(), At.numCols(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java

License:Apache License

/**
 * Runs {@code PartitionerJob} on the rows of {@code A}, writing to the directory
 * {@code label} under A's output temp path. The job is skipped (with a warning)
 * when that directory already exists.
 *
 * @param conf the configuration used for the file system lookup and the job
 * @param A the input matrix
 * @param partitions the partition count passed to the job
 * @param label the name of the output directory under A's output temp path
 * @return a DistributedRowMatrix over the output directory with A's dimensions
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, int partitions, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + PartitionerJob.class.getName());

    final Path target = new Path(A.getOutputTempPath(), label);
    final FileSystem targetFs = FileSystem.get(target.toUri(), conf);
    final PartitionerJob partitioner = new PartitionerJob();

    if (targetFs.exists(target)) {
        log.warn("----------- Skip already exists: " + target);
    } else {
        partitioner.run(conf, A.getRowPath(), target, A.numRows(), partitions);
    }

    final DistributedRowMatrix wrapped =
            new DistributedRowMatrix(target, A.getOutputTempPath(), A.numRows(), A.numCols());
    wrapped.setConf(conf);
    return wrapped;
}

From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java

License:Apache License

/**
 * Runs {@code Matrix2TextJob} on the rows of {@code A}, writing to the directory
 * {@code label} under A's output temp path. The job is skipped (with a warning)
 * when that directory already exists.
 *
 * @param conf the configuration used for the file system lookup and the job
 * @param A the input matrix
 * @param label the name of the output directory under A's output temp path
 * @return a DistributedRowMatrix over the output directory with A's dimensions
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + Matrix2TextJob.class.getName());

    final Path target = new Path(A.getOutputTempPath(), label);
    final FileSystem targetFs = FileSystem.get(target.toUri(), conf);
    final Matrix2TextJob exporter = new Matrix2TextJob();

    if (targetFs.exists(target)) {
        log.warn("----------- Skip already exists: " + target);
    } else {
        exporter.run(conf, A.getRowPath(), target);
    }

    final DistributedRowMatrix wrapped =
            new DistributedRowMatrix(target, A.getOutputTempPath(), A.numRows(), A.numCols());
    wrapped.setConf(conf);
    return wrapped;
}