Example usage of org.apache.hadoop.fs.Path.toUri()

Introduction

This page collects example usages of the toUri() method of org.apache.hadoop.fs.Path.

Prototype

public URI toUri()

Source Link

Document

Convert this Path to a URI.

Usage

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * Write a vector to filesystem so that it can be used by distributed jobs
 * @param vector/*from w ww .j a v  a 2  s . co m*/
 * @param outputDir
 * @param label the unique label that be used in naming the vector file
 * @param conf
 * @return
 * @throws IOException
 */
public static Path toDistributedVector(Vector vector, Path outputDir, String label, Configuration conf)
        throws IOException {
    Path outputFile = new Path(outputDir, "Vector-" + label);
    FileSystem fs = FileSystem.get(outputDir.toUri(), conf);
    if (fs.exists(outputFile)) {
        log.warn("----------- OVERWRITE " + outputFile + " already exists");
        fs.delete(outputFile, false);
    }
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class,
            VectorWritable.class);
    VectorWritable vectorw = new VectorWritable();
    vectorw.set(vector);
    writer.append(new IntWritable(0), vectorw);
    writer.close();
    return outputFile;
}

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

/**
 * Runs {@link Sequence2MatrixFormatJob} on the rows of {@code A} and wraps the
 * output in a new DistributedRowMatrix.
 *
 * <p>The output location is {@code label} under A's output temp path. When
 * that location already exists the job is skipped and a warning is logged;
 * the existing output is wrapped and returned either way.
 *
 * @param conf the configuration passed to the job and set on the result
 * @param A the matrix whose row path is used as the job input
 * @param label the name of the output directory under A's output temp path
 * @return a DistributedRowMatrix over the output path with A's row and column counts
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + Sequence2MatrixFormatJob.class.getName());

    Path convertedPath = new Path(A.getOutputTempPath(), label);
    FileSystem convertedFs = FileSystem.get(convertedPath.toUri(), conf);
    Sequence2MatrixFormatJob converter = new Sequence2MatrixFormatJob();
    if (convertedFs.exists(convertedPath)) {
        // Reuse the output of an earlier run instead of recomputing it.
        log.warn("----------- Skip already exists: " + convertedPath);
    } else {
        converter.run(conf, A.getRowPath(), convertedPath);
    }

    DistributedRowMatrix converted = new DistributedRowMatrix(convertedPath, A.getOutputTempPath(),
            A.numRows(), A.numCols());
    converted.setConf(conf);
    return converted;
}

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

/**
 * Configures and runs a map-only job that reads {@code matrixInputPath} with
 * {@link SequenceFileInputFormat} and writes {@code matrixOutputPath} with
 * {@code MatrixOutputFormat}.
 *
 * @param conf the configuration the job is created from
 * @param matrixInputPath the job input path
 * @param matrixOutputPath the job output path
 * @throws IOException if the job reports failure or the filesystem cannot be accessed
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem inputFs = FileSystem.get(matrixInputPath.toUri(), conf);
    // Called before the Job is created, so any setting it makes on conf is seen by the job.
    NMFCommon.setNumberOfMapSlots(conf, inputFs, matrixInputPath, "seq2mtx");

    @SuppressWarnings("deprecation")
    Job formatJob = new Job(conf);
    formatJob.setJarByClass(Sequence2MatrixFormatJob.class);
    formatJob.setJobName(Sequence2MatrixFormatJob.class.getSimpleName());

    // Both paths are qualified against the filesystem of the input path.
    Path qualifiedInput = inputFs.makeQualified(matrixInputPath);
    Path qualifiedOutput = inputFs.makeQualified(matrixOutputPath);

    // Input side.
    FileInputFormat.addInputPath(formatJob, qualifiedInput);
    formatJob.setInputFormatClass(SequenceFileInputFormat.class);

    // Output side; no reducers, so map output is the job output.
    FileOutputFormat.setOutputPath(formatJob, qualifiedOutput);
    formatJob.setNumReduceTasks(0);
    formatJob.setOutputFormatClass(MatrixOutputFormat.class);
    formatJob.setOutputKeyClass(IntWritable.class);
    formatJob.setOutputValueClass(VectorWritable.class);

    formatJob.submit();
    boolean succeeded = formatJob.waitForCompletion(true);
    if (!succeeded) {
        throw new IOException("Job failed!");
    }
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Computes A x B for two matrices that are already wrapped in
 * DistributedRowMatrix objects. Refer to {@link ABInnerHDFSBroadcastOfB} for
 * further details.
 *
 * <p>The product is written to {@code label} under A's output temp path. When
 * that path already exists the job is skipped and a warning is logged; the
 * existing output is wrapped and returned either way.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws CardinalityException if the column count of A differs from the row count of B
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABInnerHDFSBroadcastOfB.class.getName());
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }

    Path productPath = new Path(A.getOutputTempPath(), label);
    FileSystem productFs = FileSystem.get(productPath.toUri(), conf);
    ABInnerHDFSBroadcastOfB multiplier = new ABInnerHDFSBroadcastOfB();
    if (productFs.exists(productPath)) {
        // Reuse the output of an earlier run instead of recomputing it.
        log.warn("----------- Skip already exists: " + productPath);
    } else {
        multiplier.run(conf, A.getRowPath(), B.getRowPath(), productPath, B.numRows(), B.numCols());
    }

    DistributedRowMatrix product = new DistributedRowMatrix(productPath, A.getOutputTempPath(),
            A.numRows(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat} Refer to {@link ABInnerHDFSBroadcastOfB}
 * for further details./*from   w w  w.  j  a v a 2s .  co  m*/
 * 
 * @param conf the initial configuration
 * @param matrixInputPath path to matrix A
 * @param inMemMatrixDir path to matrix B (must be small enough to fit into
 *          memory)
 * @param matrixOutputPath path to which AxB will be written
 * @param inMemMatrixNumRows B rows
 * @param inMemMatrixNumCols B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, String inMemMatrixDir, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "axbinner");
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ABInnerHDFSBroadcastOfB.class);
    job.setJobName(ABInnerHDFSBroadcastOfB.class.getSimpleName());
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // since we do not use reducer, to get total order, the map output files has
    // to be renamed after this function returns: {@link
    // AlgebraCommon#fixPartitioningProblem}
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link ABOuterHDFSBroadcastOfA} for further details.
 * /*  w w w  . j  ava2 s .c o  m*/
 * @param conf
 *          the initial configuration
 * @param A
 *          matrix A
 * @param B
 *          matrix B
 * @param label
 *          the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABOuterHDFSBroadcastOfA.class.getName());
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }
    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    ABOuterHDFSBroadcastOfA job = new ABOuterHDFSBroadcastOfA();
    if (!fs.exists(outPath)) {
        job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), A.numCols());
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Configures and runs the multiplication job on matrices stored as
 * {@link SequenceFileInputFormat} files. Refer to
 * {@link ABOuterHDFSBroadcastOfA} for further details.
 *
 * <p>NOTE(review): the static {@code run} overload of this class passes the
 * row path and dimensions of A for the in-memory arguments and the row path
 * of B as the job input; the earlier wording here named them the other way
 * round. Confirm which matrix is meant.
 *
 * @param conf
 *          the initial configuration; the in-memory matrix settings are
 *          written directly into it
 * @param inMemMatrixDir
 *          path of the matrix that is held in memory (must be small enough
 *          to fit into memory)
 * @param matrixInputPath
 *          path of the matrix that is used as the job input
 * @param matrixOutputPath
 *          path to which AxB will be written
 * @param inMemMatrixNumRows
 *          row count of the in-memory matrix
 * @param inMemMatrixNumCols
 *          column count of the in-memory matrix
 * @throws IOException
 *           if the job reports failure or the filesystem cannot be accessed
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, String inMemMatrixDir, Path matrixInputPath, Path matrixOutputPath,
        int inMemMatrixNumRows, int inMemMatrixNumCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    // These settings must be in place before the Job is created from conf.
    conf.set(MATRIXINMEMORY, inMemMatrixDir);
    conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);

    @SuppressWarnings("deprecation")
    Job multiplyJob = new Job(conf);
    multiplyJob.setJarByClass(ABOuterHDFSBroadcastOfA.class);
    multiplyJob.setJobName(ABOuterHDFSBroadcastOfA.class.getSimpleName());

    // Both paths are qualified against the filesystem of the input path.
    FileSystem inputFs = FileSystem.get(matrixInputPath.toUri(), conf);
    Path qualifiedInput = inputFs.makeQualified(matrixInputPath);
    Path qualifiedOutput = inputFs.makeQualified(matrixOutputPath);

    // Input and map side.
    FileInputFormat.addInputPath(multiplyJob, qualifiedInput);
    multiplyJob.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(multiplyJob, qualifiedOutput);
    multiplyJob.setMapperClass(MyMapper.class);
    multiplyJob.setMapOutputKeyClass(IntWritable.class);
    multiplyJob.setMapOutputValueClass(VectorWritable.class);

    // Ensures total order when used with MatrixOutputFormat.
    RowPartitioner.setPartitioner(multiplyJob, RowPartitioner.IntRowPartitioner.class, inMemMatrixNumRows);

    // The same reducer class is used as combiner and as reducer.
    multiplyJob.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
    multiplyJob.setReducerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    // Output side.
    multiplyJob.setOutputFormatClass(MatrixOutputFormat.class);
    multiplyJob.setOutputKeyClass(IntWritable.class);
    multiplyJob.setOutputValueClass(VectorWritable.class);

    multiplyJob.submit();
    boolean succeeded = multiplyJob.waitForCompletion(true);
    if (!succeeded) {
        throw new IOException("Job failed!");
    }
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

/**
 * Configures and runs the map-side join job over {@code atPath} and
 * {@code bPath}, writing the result to {@code outPath}.
 *
 * <p>The two inputs are combined with an "inner" {@link CompositeInputFormat}
 * join expression. The number of reduce tasks is read from the configuration
 * key {@code algebra.reduceslots.multiply} and defaults to 10.
 *
 * @param conf the configuration the job is created from; the output
 *          cardinality is written directly into it
 * @param atPath path of the first join input
 * @param bPath path of the second join input
 * @param outPath path to which the job output is written
 * @param outCardinality value stored under the {@code OUT_CARD} configuration key
 * @throws IOException if the job reports failure or the filesystem cannot be accessed
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path atPath, Path bPath, Path outPath, int outCardinality)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Must be set before the Job is created from conf.
    conf.setInt(OUT_CARD, outCardinality);

    @SuppressWarnings("deprecation")
    Job joinJob = new Job(conf);
    joinJob.setJobName(AtBOuterStaticMapsideJoinJob.class.getSimpleName());
    joinJob.setJarByClass(AtBOuterStaticMapsideJoinJob.class);

    // All three paths are qualified against the filesystem of atPath.
    FileSystem atFs = FileSystem.get(atPath.toUri(), conf);
    Path qualifiedAt = atFs.makeQualified(atPath);
    Path qualifiedB = atFs.makeQualified(bPath);

    // Map-side join expression over the two inputs.
    joinJob.setInputFormatClass(CompositeInputFormat.class);
    String joinExpression = CompositeInputFormat.compose("inner", SequenceFileInputFormat.class,
            qualifiedAt, qualifiedB);
    joinJob.getConfiguration().set(CompositeInputFormat.JOIN_EXPR, joinExpression);

    joinJob.setOutputFormatClass(MatrixOutputFormat.class);
    Path qualifiedOut = atFs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(joinJob, qualifiedOut);

    joinJob.setMapperClass(MyMapper.class);
    joinJob.setMapOutputKeyClass(IntWritable.class);
    joinJob.setMapOutputValueClass(VectorWritable.class);

    joinJob.setCombinerClass(MyReducer.class);

    joinJob.setNumReduceTasks(conf.getInt("algebra.reduceslots.multiply", 10));

    joinJob.setReducerClass(MyReducer.class);
    joinJob.setOutputKeyClass(IntWritable.class);
    joinJob.setOutputValueClass(VectorWritable.class);

    joinJob.submit();
    boolean succeeded = joinJob.waitForCompletion(true);
    if (!succeeded) {
        throw new IOException("Job failed");
    }
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

/**
 * Runs {@link AtBOuterStaticMapsideJoinJob} on two matrices that are already
 * wrapped in DistributedRowMatrix objects.
 *
 * <p>The output is written to {@code label} under A's output temp path. When
 * that path already exists the job is skipped and a warning is logged; the
 * existing output is wrapped and returned either way.
 *
 * @param conf the configuration passed to the job and set on the result
 * @param A the first matrix; must have the same number of rows as B
 * @param B the second matrix
 * @param label the label for the output directory
 * @return a DistributedRowMatrix over the output path with A.numCols() rows
 *         and B.numCols() columns
 * @throws CardinalityException if A and B differ in their number of rows
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtBOuterStaticMapsideJoinJob.class.getName());
    if (A.numRows() != B.numRows()) {
        throw new CardinalityException(A.numRows(), B.numRows());
    }

    Path productPath = new Path(A.getOutputTempPath(), label);
    FileSystem productFs = FileSystem.get(productPath.toUri(), conf);
    AtBOuterStaticMapsideJoinJob joinJob = new AtBOuterStaticMapsideJoinJob();
    if (productFs.exists(productPath)) {
        // Reuse the output of an earlier run instead of recomputing it.
        log.warn("----------- Skip already exists: " + productPath);
    } else {
        joinJob.run(conf, A.getRowPath(), B.getRowPath(), productPath, B.numCols());
    }

    DistributedRowMatrix product = new DistributedRowMatrix(productPath, A.getOutputTempPath(),
            A.numCols(), B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link AtB_DMJ} for further details.
 * /*from   w w  w. j  av  a  2  s. c  o m*/
 * Automatically decide on partitioning the larger matrix to be used with
 * in-memory combiners.
 * 
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @param labelAtCol by using a fixed label for AtCol one can avoid the second
 *          run of the partitioning job if we know that At is not changed
 * @param lableBCol by using a fixed label for BCol one can avoid the second
 *          run of the partitioning job if we know that B is not changed
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix smartRun(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        String label, String labelAtCol, String lableBCol)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());
    if (At.numRows() != B.numRows())
        throw new CardinalityException(At.numRows(), B.numRows());
    Path outPath = new Path(At.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    AtB_DMJ job = new AtB_DMJ();
    if (!fs.exists(outPath)) {
        int numColPartitionsAt = 1, numColPartitionsB = 1;
        int numColPartitions = NMFCommon.computeOptColPartitionsForMemCombiner(conf, At.numCols(), B.numCols());
        long atSize = MapDir.du(At.getRowPath(), fs);
        long bSize = MapDir.du(B.getRowPath(), fs);
        //cost is size of remote reads. For each col partition we need to read the entire of the other matrix once
        long atPartitionCost = numColPartitions * bSize;
        long bPartitionCost = numColPartitions * atSize;
        log.info("smart partitioning: numColPartitions: " + numColPartitions + " atSize: " + atSize + " bSize: "
                + bSize + " atCost=" + atPartitionCost + " vs.  bCost=" + bPartitionCost);
        if (atPartitionCost < bPartitionCost) {
            At = ColPartitionJob.partition(At, conf, labelAtCol, numColPartitions);
            numColPartitionsAt = numColPartitions;
        } else {
            B = ColPartitionJob.partition(B, conf, lableBCol, numColPartitions);
            numColPartitionsB = numColPartitions;
        }
        job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt,
                numColPartitionsB, true);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}