Example usage for org.apache.mahout.math.hadoop.DistributedRowMatrix#getOutputTempPath()

List of usage examples for org.apache.mahout.math.hadoop.DistributedRowMatrix#getOutputTempPath()

Introduction

On this page you can find example usages of the method getOutputTempPath() of the class org.apache.mahout.math.hadoop.DistributedRowMatrix.

Prototype

public Path getOutputTempPath() 

Source Link

Usage

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

/**
 * Runs {@link Sequence2MatrixFormatJob} over the rows of {@code A} and wraps the output
 * directory in a new {@link DistributedRowMatrix}.
 *
 * <p>The output directory is {@code label} resolved under {@code A.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the configuration used to look up the file system, handed to the job, and set on
 *          the returned matrix
 * @param A the input matrix
 * @param label the name of the output directory under A's temp path
 * @return a matrix over the output directory, declared with the row and column counts of
 *         {@code A}
 * @throws IOException propagated from the file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = Sequence2MatrixFormatJob.class.getName();
    log.info("running " + jobName);

    final Path resultDir = new Path(A.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(resultDir.toUri(), conf);
    final Sequence2MatrixFormatJob formatJob = new Sequence2MatrixFormatJob();

    if (fileSystem.exists(resultDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + resultDir);
    } else {
        formatJob.run(conf, A.getRowPath(), resultDir);
    }

    final DistributedRowMatrix result = new DistributedRowMatrix(resultDir, A.getOutputTempPath(),
            A.numRows(), A.numCols());
    result.setConf(conf);
    return result;
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Computes A x B for two matrices that are already wrapped in
 * {@link DistributedRowMatrix} objects. See {@link ABInnerHDFSBroadcastOfB} for details of the
 * job itself.
 *
 * <p>The output directory is {@code label} resolved under {@code A.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B; its row count must equal the column count of {@code A}
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object, declared as
 *         {@code A.numRows()} x {@code B.numCols()}
 * @throws CardinalityException if {@code A.numCols() != B.numRows()}
 * @throws IOException propagated from the file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = ABInnerHDFSBroadcastOfB.class.getName();
    log.info("running " + jobName);

    // The inner dimensions must agree before any work is scheduled.
    final boolean innerDimsAgree = A.numCols() == B.numRows();
    if (!innerDimsAgree) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }

    final Path productDir = new Path(A.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(productDir.toUri(), conf);
    final ABInnerHDFSBroadcastOfB multiplyJob = new ABInnerHDFSBroadcastOfB();

    if (fileSystem.exists(productDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + productDir);
    } else {
        multiplyJob.run(conf, A.getRowPath(), B.getRowPath(), productDir, B.numRows(), B.numCols());
    }

    final DistributedRowMatrix product = new DistributedRowMatrix(productDir, A.getOutputTempPath(),
            A.numRows(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link ABOuterHDFSBroadcastOfA} for further details.
 * /* w  ww. j  av a  2  s.  c om*/
 * @param conf
 *          the initial configuration
 * @param A
 *          matrix A
 * @param B
 *          matrix B
 * @param label
 *          the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABOuterHDFSBroadcastOfA.class.getName());
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }
    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    ABOuterHDFSBroadcastOfA job = new ABOuterHDFSBroadcastOfA();
    if (!fs.exists(outPath)) {
        job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), A.numCols());
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

/**
 * Runs {@link AtBOuterStaticMapsideJoinJob} on {@code A} and {@code B} and wraps the output
 * directory in a new {@link DistributedRowMatrix}.
 *
 * <p>The output directory is {@code label} resolved under {@code A.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the configuration used to look up the file system, handed to the job, and set on
 *          the returned matrix
 * @param A the first input matrix
 * @param B the second input matrix; it must have the same number of rows as {@code A}
 * @param label the name of the output directory under A's temp path
 * @return a matrix over the output directory, declared as {@code A.numCols()} x
 *         {@code B.numCols()}
 * @throws CardinalityException if {@code A.numRows() != B.numRows()}
 * @throws IOException propagated from the file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = AtBOuterStaticMapsideJoinJob.class.getName();
    log.info("running " + jobName);

    // Both operands are joined row by row, so their row counts must agree.
    final boolean rowCountsAgree = A.numRows() == B.numRows();
    if (!rowCountsAgree) {
        throw new CardinalityException(A.numRows(), B.numRows());
    }

    final Path productDir = new Path(A.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(productDir.toUri(), conf);
    final AtBOuterStaticMapsideJoinJob joinJob = new AtBOuterStaticMapsideJoinJob();

    if (fileSystem.exists(productDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + productDir);
    } else {
        joinJob.run(conf, A.getRowPath(), B.getRowPath(), productDir, B.numCols());
    }

    final DistributedRowMatrix product = new DistributedRowMatrix(productDir, A.getOutputTempPath(),
            A.numCols(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link AtB_DMJ} for further details.
 * /*from  www  . j  av  a2s  .com*/
 * Automatically decide on partitioning the larger matrix to be used with
 * in-memory combiners.
 * 
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @param labelAtCol by using a fixed label for AtCol one can avoid the second
 *          run of the partitioning job if we know that At is not changed
 * @param lableBCol by using a fixed label for BCol one can avoid the second
 *          run of the partitioning job if we know that B is not changed
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix smartRun(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        String label, String labelAtCol, String lableBCol)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());
    if (At.numRows() != B.numRows())
        throw new CardinalityException(At.numRows(), B.numRows());
    Path outPath = new Path(At.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    AtB_DMJ job = new AtB_DMJ();
    if (!fs.exists(outPath)) {
        int numColPartitionsAt = 1, numColPartitionsB = 1;
        int numColPartitions = NMFCommon.computeOptColPartitionsForMemCombiner(conf, At.numCols(), B.numCols());
        long atSize = MapDir.du(At.getRowPath(), fs);
        long bSize = MapDir.du(B.getRowPath(), fs);
        //cost is size of remote reads. For each col partition we need to read the entire of the other matrix once
        long atPartitionCost = numColPartitions * bSize;
        long bPartitionCost = numColPartitions * atSize;
        log.info("smart partitioning: numColPartitions: " + numColPartitions + " atSize: " + atSize + " bSize: "
                + bSize + " atCost=" + atPartitionCost + " vs.  bCost=" + bPartitionCost);
        if (atPartitionCost < bPartitionCost) {
            At = ColPartitionJob.partition(At, conf, labelAtCol, numColPartitions);
            numColPartitionsAt = numColPartitions;
        } else {
            B = ColPartitionJob.partition(B, conf, lableBCol, numColPartitions);
            numColPartitionsB = numColPartitions;
        }
        job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt,
                numColPartitionsB, true);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Computes A x B from {@code At} (the transpose of A) and {@code B}, both already wrapped in
 * {@link DistributedRowMatrix} objects. See {@link AtB_DMJ} for details of the job itself.
 *
 * <p>The output directory is {@code label} resolved under {@code At.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B; it must have the same number of rows as {@code At}
 * @param numColPartitionsAt number of column partitions of {@code At}, forwarded to the job; at
 *          least one of the two partition counts must be 1
 * @param numColPartitionsB number of column partitions of {@code B}, forwarded to the job; at
 *          least one of the two partition counts must be 1
 * @param label the label for the output directory
 * @param useInMemCombiner flag forwarded unchanged to the job
 * @return AxB wrapped in a DistributedRowMatrix object, declared as
 *         {@code At.numCols()} x {@code B.numCols()}
 * @throws CardinalityException if {@code At.numRows() != B.numRows()}
 * @throws IOException if both partition counts differ from 1, or propagated from the
 *           file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        int numColPartitionsAt, int numColPartitionsB, String label, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = AtB_DMJ.class.getName();
    log.info("running " + jobName);

    final boolean rowCountsAgree = At.numRows() == B.numRows();
    if (!rowCountsAgree) {
        throw new CardinalityException(At.numRows(), B.numRows());
    }
    // At most one of the operands may be split by columns.
    final boolean atMostOnePartitioned = numColPartitionsAt == 1 || numColPartitionsB == 1;
    if (!atMostOnePartitioned) {
        throw new IOException("AtB_DMJ: not both At and B can be column partitioned!");
    }

    final Path productDir = new Path(At.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(productDir.toUri(), conf);
    final AtB_DMJ multiplyJob = new AtB_DMJ();

    if (fileSystem.exists(productDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + productDir);
    } else {
        multiplyJob.run(conf, At.getRowPath(), B.getRowPath(), productDir, At.numCols(), B.numCols(),
                numColPartitionsAt, numColPartitionsB, useInMemCombiner);
    }

    final DistributedRowMatrix product = new DistributedRowMatrix(productDir, At.getOutputTempPath(),
            At.numCols(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java

License:Apache License

/**
 * Runs {@link PartitionerJob} over the rows of {@code A} and wraps the output directory in a
 * new {@link DistributedRowMatrix}.
 *
 * <p>The output directory is {@code label} resolved under {@code A.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the configuration used to look up the file system, handed to the job, and set on
 *          the returned matrix
 * @param A the input matrix
 * @param partitions the partition count forwarded to the job
 * @param label the name of the output directory under A's temp path
 * @return a matrix over the output directory, declared with the row and column counts of
 *         {@code A}
 * @throws IOException propagated from the file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, int partitions, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = PartitionerJob.class.getName();
    log.info("running " + jobName);

    final Path resultDir = new Path(A.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(resultDir.toUri(), conf);
    final PartitionerJob partitionerJob = new PartitionerJob();

    if (fileSystem.exists(resultDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + resultDir);
    } else {
        partitionerJob.run(conf, A.getRowPath(), resultDir, A.numRows(), partitions);
    }

    final DistributedRowMatrix result = new DistributedRowMatrix(resultDir, A.getOutputTempPath(),
            A.numRows(), A.numCols());
    result.setConf(conf);
    return result;
}

From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java

License:Apache License

/**
 * Runs {@link Matrix2TextJob} over the rows of {@code A} and wraps the output directory in a
 * new {@link DistributedRowMatrix}.
 *
 * <p>The output directory is {@code label} resolved under {@code A.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the configuration used to look up the file system, handed to the job, and set on
 *          the returned matrix
 * @param A the input matrix
 * @param label the name of the output directory under A's temp path
 * @return a matrix over the output directory, declared with the row and column counts of
 *         {@code A}
 * @throws IOException propagated from the file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = Matrix2TextJob.class.getName();
    log.info("running " + jobName);

    final Path textDir = new Path(A.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(textDir.toUri(), conf);
    final Matrix2TextJob textJob = new Matrix2TextJob();

    if (fileSystem.exists(textDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + textDir);
    } else {
        textJob.run(conf, A.getRowPath(), textDir);
    }

    final DistributedRowMatrix result = new DistributedRowMatrix(textDir, A.getOutputTempPath(),
            A.numRows(), A.numCols());
    result.setConf(conf);
    return result;
}

From source file:com.twitter.algebra.nmf.ColPartitionJob.java

License:Apache License

/**
 * Partition A by columns. Refer to {@link ColPartitionJob} for further
 * details./*from w  w  w .ja  v a 2s.  c o  m*/
 * 
 * @param distM input matrix A
 * @param conf the initial configuration
 * @param label the label for the output directory
 * @param numColPartitions the hint for the desired number of column
 *          partitions
 * @return Partitioned A wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix partition(DistributedRowMatrix distM, Configuration conf, String label,
        int numColPartitions) throws IOException, InterruptedException, ClassNotFoundException {
    Path outputPath = new Path(distM.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
    if (!fs.exists(outputPath)) {
        ColPartitionJob job = new ColPartitionJob();
        job.run(conf, distM.getRowPath(), outputPath, distM.numRows(), distM.numCols(), numColPartitions);
    } else {
        log.warn("----------- Skip already exists: " + outputPath);
    }
    DistributedRowMatrix m = new DistributedRowMatrix(outputPath, distM.getOutputTempPath(), distM.numRows(),
            distM.numCols());
    m.setConf(conf);
    return m;
}

From source file:com.twitter.algebra.nmf.CombinerJob.java

License:Apache License

/**
 * Runs {@link CombinerJob} over the rows of {@code A} and wraps the output directory in a new
 * {@link DistributedRowMatrix}.
 *
 * <p>The output directory is {@code label} resolved under {@code A.getOutputTempPath()}. When
 * that directory already exists the job is not launched; a warning is logged and the existing
 * directory is wrapped as-is.
 *
 * @param conf the configuration used to look up the file system, handed to the job, and set on
 *          the returned matrix
 * @param A the input matrix
 * @param label the name of the output directory under A's temp path
 * @return a matrix over the output directory, declared with the row and column counts of
 *         {@code A}
 * @throws IOException propagated from the file-system calls or the job
 * @throws InterruptedException declared for the underlying job run
 * @throws ClassNotFoundException declared for the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    final String jobName = CombinerJob.class.getName();
    log.info("running " + jobName);

    final Path combinedDir = new Path(A.getOutputTempPath(), label);
    final FileSystem fileSystem = FileSystem.get(combinedDir.toUri(), conf);
    final CombinerJob combinerJob = new CombinerJob();

    if (fileSystem.exists(combinedDir)) {
        // A previous run already produced this directory; reuse it instead of recomputing.
        log.warn("----------- Skip already exists: " + combinedDir);
    } else {
        combinerJob.run(conf, A.getRowPath(), combinedDir, A.numRows());
    }

    final DistributedRowMatrix result = new DistributedRowMatrix(combinedDir, A.getOutputTempPath(),
            A.numRows(), A.numCols());
    result.setConf(conf);
    return result;
}