Example usage for org.apache.hadoop.fs Path toUri

List of usage examples for org.apache.hadoop.fs Path toUri

Introduction

On this page you can find example usage for org.apache.hadoop.fs Path toUri.

Prototype

public URI toUri() 

Document

Convert this Path to a URI.
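
Before the full examples below, here is a minimal, self-contained sketch of the pattern they all share: the URI returned by toUri() is passed to FileSystem.get(URI, Configuration) so that the Path is resolved against the filesystem that owns it (HDFS, local, etc.). The path and host names used here (hdfs://namenode:8020/tmp/output) are illustrative placeholders, not values taken from the examples below.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Illustrative path; replace with a path that exists in your cluster.
        Path outPath = new Path("hdfs://namenode:8020/tmp/output");

        // toUri() exposes the Path's scheme, authority, and path as a java.net.URI.
        URI uri = outPath.toUri();
        System.out.println("scheme = " + uri.getScheme()); // e.g. "hdfs"
        System.out.println("path   = " + uri.getPath());   // e.g. "/tmp/output"

        // The recurring pattern in the examples below: resolve the FileSystem
        // for this URI, then operate on the path through it.
        FileSystem fs = FileSystem.get(uri, conf);
        if (!fs.exists(outPath)) {
            System.out.println(outPath + " does not exist yet");
        }
    }
}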

Usage

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link AtB_DMJ} for further details.
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param numColPartitionsAt 
 * @param numColPartitionsB 
 * @param label the label for the output directory
 * @param useInMemCombiner
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        int numColPartitionsAt, int numColPartitionsB, String label, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());
    if (At.numRows() != B.numRows())
        throw new CardinalityException(At.numRows(), B.numRows());
    if (numColPartitionsAt != 1 && numColPartitionsB != 1)
        throw new IOException("AtB_DMJ: not both At and B can be column partitioned!");
    Path outPath = new Path(At.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    AtB_DMJ job = new AtB_DMJ();
    if (!fs.exists(outPath)) {
        job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt,
                numColPartitionsB, useInMemCombiner);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(),
            B.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. The smaller of At and B must also conform
 * with {@link MapDir} format. Refer to {@link AtB_DMJ} for further details.
 * @param conf the initial configuration
 * @param atPath path to transpose of matrix A.
 * @param bPath path to matrix B
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols
 * @param numColPartitionsAt
 * @param numColPartitionsB 
 * @param useInMemCombiner
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path atPath, Path bPath, Path matrixOutputPath, int atCols, int bCols,
        int numColPartitionsAt, int numColPartitionsB, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean aIsMapDir = true;
    if (1 == numColPartitionsAt && 1 == numColPartitionsB) {// if we do not use col partitioning
        FileSystem fs = FileSystem.get(atPath.toUri(), conf);
        long atSize = MapDir.du(atPath, fs);
        long bSize = MapDir.du(bPath, fs);
        log.info("Choosing the smaller matrix: atSize: " + atSize + " bSize: " + bSize);
        aIsMapDir = atSize < bSize;
    } else if (numColPartitionsAt != 1) {
        aIsMapDir = false;
    } else if (numColPartitionsB != 1) {
        aIsMapDir = true;
    }
    AtB_DMJ job = new AtB_DMJ();
    Job hjob;
    if (aIsMapDir) {
        int colsPerPartition = ColPartitionJob.getColPartitionSize(bCols, numColPartitionsB);
        hjob = job.run(conf, atPath, bPath, matrixOutputPath, atCols, bCols, colsPerPartition, aIsMapDir,
                useInMemCombiner);
    } else {
        int colsPerPartition = ColPartitionJob.getColPartitionSize(atCols, numColPartitionsAt);
        hjob = job.run(conf, bPath, atPath, matrixOutputPath, atCols, bCols, colsPerPartition, aIsMapDir,
                useInMemCombiner);
    }
    boolean res = hjob.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed! ");
}

From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java

License:Apache License

/**
 * Perform A x B, where At and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. One of At and B must also conform with
 * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details.
 * @param conf the initial configuration
 * @param mapDirPath path to the matrix in {@link MapDir} format
 * @param matrixInputPaths the list of paths to matrix input partitions over
 *          which we iterate
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols
 * @param colsPerPartition cols per partition of the input matrix (whether At or B)
 * @param aIsMapDir is A chosen to be loaded as MapDir
 * @param useInMemCombiner
 * @param numberOfJobs the hint for the desired number of parallel jobs
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols,
        int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(MATRIXINMEMORY, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner);
    conf.setInt(RESULTROWS, atCols);
    conf.setInt(RESULTCOLS, bCols);
    conf.setInt(PARTITIONCOLS, colsPerPartition);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj");

    if (useInMemCombiner) {
        Configuration newConf = new Configuration(conf);
        newConf.set("mapreduce.task.io.sort.mb", "1");
        conf = newConf;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(AtB_DMJ.class);
    job.setJobName(AtB_DMJ.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    matrixInputPaths = fs.makeQualified(matrixInputPaths);
    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    if (!useInMemCombiner)
        job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setReducerClass(EpsilonReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}

From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java

License:Apache License

public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, int partitions, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + PartitionerJob.class.getName());
    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    PartitionerJob job = new PartitionerJob();
    if (!fs.exists(outPath)) {
        job.run(conf, A.getRowPath(), outPath, A.numRows(), partitions);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            A.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows, int partitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(PartitionerJob.class);
    job.setJobName(PartitionerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setNumReduceTasks(partitions);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(IdMapper.class);
    job.setReducerClass(IdReducer.class);

    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java

License:Apache License

public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + Matrix2TextJob.class.getName());

    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    Matrix2TextJob job = new Matrix2TextJob();
    if (!fs.exists(outPath)) {
        job.run(conf, A.getRowPath(), outPath);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            A.numCols());
    distRes.setConf(conf);
    return distRes;
}

From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Matrix2TextJob.class);
    job.setJobName(Matrix2TextJob.class.getSimpleName());
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    //    FileInputFormat.addInputPath(job, matrixInputPath);
    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);
    //    job.setInputFormatClass(SequenceFileInputFormat.class);
    TextOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);
    //    job.setOutputKeyClass(IntWritable.class);
    //    job.setOutputValueClass(org.apache.hadoop.io.Text);
    job.setMapperClass(IdMapper.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.ColPartitionJob.java

License:Apache License

/**
 * Partition A by columns. Refer to {@link ColPartitionJob} for further
 * details.
 * 
 * @param distM input matrix A
 * @param conf the initial configuration
 * @param label the label for the output directory
 * @param numColPartitions the hint for the desired number of column
 *          partitions
 * @return Partitioned A wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix partition(DistributedRowMatrix distM, Configuration conf, String label,
        int numColPartitions) throws IOException, InterruptedException, ClassNotFoundException {
    Path outputPath = new Path(distM.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
    if (!fs.exists(outputPath)) {
        ColPartitionJob job = new ColPartitionJob();
        job.run(conf, distM.getRowPath(), outputPath, distM.numRows(), distM.numCols(), numColPartitions);
    } else {
        log.warn("----------- Skip already exists: " + outputPath);
    }
    DistributedRowMatrix m = new DistributedRowMatrix(outputPath, distM.getOutputTempPath(), distM.numRows(),
            distM.numCols());
    m.setConf(conf);
    return m;
}

From source file:com.twitter.algebra.nmf.ColPartitionJob.java

License:Apache License

/**
 * Partition A on columns, where A refers to the path that contain a matrix in
 * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for
 * further details.
 * 
 * @param conf the initial configuration
 * @param matrixInputPath the path to the input matrix A
 * @param matrixOutputPath the path of the resulting partitioned matrix
 * @param numInputRows rows
 * @param numInputCols cols
 * @param numColPartitions the hint for the desired number of column
 *          partitions
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols, int numColPartitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition");

    int colPartSize = getColPartitionSize(numInputCols, numColPartitions);
    numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize);

    if (numReducers < numColPartitions)
        numReducers = numColPartitions;

    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition");

    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(NUM_ORIG_COLS_KEY, numInputCols);
    conf.setInt(NUM_COL_PARTITIONS, numColPartitions);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ColPartitionJob.class);
    job.setJobName(ColPartitionJob.class.getSimpleName());

    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(ElementWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class, numInputRows,
            numInputCols, numColPartitions);

    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(numReducers);

    //    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}

From source file:com.twitter.algebra.nmf.CombinerJob.java

License:Apache License

public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + CombinerJob.class.getName());
    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    CombinerJob job = new CombinerJob();
    if (!fs.exists(outPath)) {
        job.run(conf, A.getRowPath(), outPath, A.numRows());
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }
    DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            A.numCols());
    distRes.setConf(conf);
    return distRes;
}