List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/**
 * Computes A x B for matrices that are already wrapped in
 * {@link DistributedRowMatrix} objects, given the transpose of A. Refer to
 * {@link AtB_DMJ} for further details.
 *
 * <p>The result is written to {@code label} under the temp path of {@code At}.
 * If that path already exists the job is not run and the existing output is
 * wrapped and returned instead.
 *
 * @param conf the initial configuration
 * @param At transpose of matrix A
 * @param B matrix B
 * @param numColPartitionsAt number of column partitions of At; at most one of
 *        this and {@code numColPartitionsB} may differ from 1
 * @param numColPartitionsB number of column partitions of B
 * @param label the label for the output directory
 * @param useInMemCombiner passed through unchanged to the path-based overload
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws CardinalityException if At and B do not have the same number of rows
 * @throws IOException if both At and B are column partitioned, or on I/O failure
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B,
        int numColPartitionsAt, int numColPartitionsB, String label, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtB_DMJ.class.getName());

    // Validate the arguments before touching the file system.
    int atRows = At.numRows();
    int bRows = B.numRows();
    if (atRows != bRows) {
        throw new CardinalityException(atRows, bRows);
    }
    boolean atIsPartitioned = numColPartitionsAt != 1;
    boolean bIsPartitioned = numColPartitionsB != 1;
    if (atIsPartitioned && bIsPartitioned) {
        throw new IOException("AtB_DMJ: not both At and B can be column partitioned!");
    }

    Path tempPath = At.getOutputTempPath();
    Path outPath = new Path(tempPath, label);
    FileSystem outputFs = FileSystem.get(outPath.toUri(), conf);
    AtB_DMJ multiplier = new AtB_DMJ();
    if (outputFs.exists(outPath)) {
        log.warn("----------- Skip already exists: " + outPath);
    } else {
        multiplier.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(),
                numColPartitionsAt, numColPartitionsB, useInMemCombiner);
    }

    DistributedRowMatrix product = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(),
            B.numCols());
    product.setConf(conf);
    return product;
}
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/** * Perform A x B, where A and B refer to the paths that contain matrices in * {@link SequenceFileInputFormat}. The smaller of At and B must also conform * with {@link MapDir} format. Refer to {@link AtB_DMJ} for further details. * /*from w ww.j a va2 s .c o m*/ * @param conf the initial configuration * @param atPath path to transpose of matrix A. * @param bPath path to matrix B * @param matrixOutputPath path to which AxB will be written * @param atCols number of columns of At (rows of A) * @param bCols * @param numColPartitionsAt * @param numColPartitionsB * @param useInMemCombiner * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public void run(Configuration conf, Path atPath, Path bPath, Path matrixOutputPath, int atCols, int bCols, int numColPartitionsAt, int numColPartitionsB, boolean useInMemCombiner) throws IOException, InterruptedException, ClassNotFoundException { boolean aIsMapDir = true; if (1 == numColPartitionsAt && 1 == numColPartitionsB) {// if we do not use col partitioning FileSystem fs = FileSystem.get(atPath.toUri(), conf); long atSize = MapDir.du(atPath, fs); long bSize = MapDir.du(bPath, fs); log.info("Choosing the smaller matrix: atSize: " + atSize + " bSize: " + bSize); aIsMapDir = atSize < bSize; } else if (numColPartitionsAt != 1) { aIsMapDir = false; } else if (numColPartitionsB != 1) { aIsMapDir = true; } AtB_DMJ job = new AtB_DMJ(); Job hjob; if (aIsMapDir) { int colsPerPartition = ColPartitionJob.getColPartitionSize(bCols, numColPartitionsB); hjob = job.run(conf, atPath, bPath, matrixOutputPath, atCols, bCols, colsPerPartition, aIsMapDir, useInMemCombiner); } else { int colsPerPartition = ColPartitionJob.getColPartitionSize(atCols, numColPartitionsAt); hjob = job.run(conf, bPath, atPath, matrixOutputPath, atCols, bCols, colsPerPartition, aIsMapDir, useInMemCombiner); } boolean res = hjob.waitForCompletion(true); if (!res) throw new IOException("Job failed! "); }
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/** * Perform A x B, where At and B refer to the paths that contain matrices in * {@link SequenceFileInputFormat}. One of At and B must also conform with * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details. * //from w ww.java 2 s.c o m * @param conf the initial configuration * @param mapDirPath path to the matrix in {@link MapDir} format * @param matrixInputPaths the list of paths to matrix input partitions over * which we iterate * @param matrixOutputPath path to which AxB will be written * @param atCols number of columns of At (rows of A) * @param bCols * @param colsPerPartition cols per partition of the input matrix (whether At or B) * @param aIsMapDir is A chosen to be loaded as MapDir * @param useInMemCombiner * @param numberOfJobs the hint for the desired number of parallel jobs * @return the running job * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols, int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); conf.set(MATRIXINMEMORY, mapDirPath.toString()); conf.setBoolean(AISMAPDIR, aIsMapDir); conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner); conf.setInt(RESULTROWS, atCols); conf.setInt(RESULTCOLS, bCols); conf.setInt(PARTITIONCOLS, colsPerPartition); FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj"); if (useInMemCombiner) { Configuration newConf = new Configuration(conf); newConf.set("mapreduce.task.io.sort.mb", "1"); conf = newConf; } @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(AtB_DMJ.class); job.setJobName(AtB_DMJ.class.getSimpleName()); matrixOutputPath = fs.makeQualified(matrixOutputPath); matrixInputPaths = fs.makeQualified(matrixInputPaths); 
MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); if (!useInMemCombiner) job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj"); job.setNumReduceTasks(numReducers); // ensures total order (when used with {@link MatrixOutputFormat}), RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols); job.setReducerClass(EpsilonReducer.class); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); return job; }
From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java
License:Apache License
/**
 * Runs {@link PartitionerJob} over the rows of {@code A} and wraps the output
 * in a new {@link DistributedRowMatrix}.
 *
 * <p>The output is written to {@code label} under the temp path of {@code A}.
 * If that path already exists the job is not run and the existing output is
 * wrapped and returned instead.
 *
 * @param conf the initial configuration
 * @param A the input matrix
 * @param partitions passed to the job as its partition count
 * @param label the label for the output directory
 * @return the job output wrapped in a DistributedRowMatrix with the same
 *         dimensions as {@code A}
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, int partitions, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + PartitionerJob.class.getName());

    Path tempPath = A.getOutputTempPath();
    Path outPath = new Path(tempPath, label);
    FileSystem outputFs = FileSystem.get(outPath.toUri(), conf);
    PartitionerJob partitioner = new PartitionerJob();
    if (outputFs.exists(outPath)) {
        log.warn("----------- Skip already exists: " + outPath);
    } else {
        partitioner.run(conf, A.getRowPath(), outPath, A.numRows(), partitions);
    }

    DistributedRowMatrix partitioned = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            A.numCols());
    partitioned.setConf(conf);
    return partitioned;
}
From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java
License:Apache License
/**
 * Configures, submits and waits for a job that reads a matrix in
 * {@link SequenceFileInputFormat} and writes it back as sequence files using
 * {@code partitions} reduce tasks.
 *
 * @param conf the job configuration
 * @param matrixInputPath path to the input matrix
 * @param matrixOutputPath path to which the output is written; its last
 *        component is appended to the job name
 * @param aRows row count handed to the row partitioner
 * @param partitions number of reduce tasks
 * @throws IOException if the job fails, or on I/O failure
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows, int partitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job partitionJob = new Job(conf);
    partitionJob.setJarByClass(PartitionerJob.class);
    String jobName = PartitionerJob.class.getSimpleName() + "-" + matrixOutputPath.getName();
    partitionJob.setJobName(jobName);

    // Both paths are qualified against the file system of the input path.
    FileSystem inputFs = FileSystem.get(matrixInputPath.toUri(), conf);
    Path qualifiedInput = inputFs.makeQualified(matrixInputPath);
    Path qualifiedOutput = inputFs.makeQualified(matrixOutputPath);

    // Input side.
    FileInputFormat.addInputPath(partitionJob, qualifiedInput);
    partitionJob.setInputFormatClass(SequenceFileInputFormat.class);

    // Output side.
    FileOutputFormat.setOutputPath(partitionJob, qualifiedOutput);
    partitionJob.setNumReduceTasks(partitions);
    partitionJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    partitionJob.setOutputKeyClass(IntWritable.class);
    partitionJob.setOutputValueClass(VectorWritable.class);

    // Identity map and reduce; the partitioner decides where each row goes.
    partitionJob.setMapperClass(IdMapper.class);
    partitionJob.setReducerClass(IdReducer.class);
    RowPartitioner.setPartitioner(partitionJob, RowPartitioner.IntRowPartitioner.class, aRows);

    partitionJob.submit();
    if (!partitionJob.waitForCompletion(true)) {
        throw new IOException("Job failed!");
    }
}
From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java
License:Apache License
/**
 * Runs {@link Matrix2TextJob} over the rows of {@code A} and wraps the output
 * path in a new {@link DistributedRowMatrix}.
 *
 * <p>The output is written to {@code label} under the temp path of {@code A}.
 * If that path already exists the job is not run and the existing output is
 * wrapped and returned instead.
 *
 * @param conf the initial configuration
 * @param A the input matrix
 * @param label the label for the output directory
 * @return a DistributedRowMatrix pointing at the output path, with the same
 *         dimensions as {@code A}
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + Matrix2TextJob.class.getName());

    Path tempPath = A.getOutputTempPath();
    Path outPath = new Path(tempPath, label);
    FileSystem outputFs = FileSystem.get(outPath.toUri(), conf);
    Matrix2TextJob converter = new Matrix2TextJob();
    if (outputFs.exists(outPath)) {
        log.warn("----------- Skip already exists: " + outPath);
    } else {
        converter.run(conf, A.getRowPath(), outPath);
    }

    DistributedRowMatrix wrapped = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(),
            A.numCols());
    wrapped.setConf(conf);
    return wrapped;
}
From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath) throws IOException, InterruptedException, ClassNotFoundException { @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(Matrix2TextJob.class); job.setJobName(Matrix2TextJob.class.getSimpleName()); FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); // FileInputFormat.addInputPath(job, matrixInputPath); MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class); // job.setInputFormatClass(SequenceFileInputFormat.class); TextOutputFormat.setOutputPath(job, matrixOutputPath); job.setNumReduceTasks(0);/*from w ww.ja v a 2s . c o m*/ job.setOutputFormatClass(TextOutputFormat.class); // job.setOutputKeyClass(IntWritable.class); // job.setOutputValueClass(org.apache.hadoop.io.Text); job.setMapperClass(IdMapper.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.nmf.ColPartitionJob.java
License:Apache License
/** * Partition A by columns. Refer to {@link ColPartitionJob} for further * details./*w w w . j av a2 s . c o m*/ * * @param distM input matrix A * @param conf the initial configuration * @param label the label for the output directory * @param numColPartitions the hint for the desired number of column * partitions * @return Partitioned A wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix partition(DistributedRowMatrix distM, Configuration conf, String label, int numColPartitions) throws IOException, InterruptedException, ClassNotFoundException { Path outputPath = new Path(distM.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outputPath.toUri(), conf); if (!fs.exists(outputPath)) { ColPartitionJob job = new ColPartitionJob(); job.run(conf, distM.getRowPath(), outputPath, distM.numRows(), distM.numCols(), numColPartitions); } else { log.warn("----------- Skip already exists: " + outputPath); } DistributedRowMatrix m = new DistributedRowMatrix(outputPath, distM.getOutputTempPath(), distM.numRows(), distM.numCols()); m.setConf(conf); return m; }
From source file:com.twitter.algebra.nmf.ColPartitionJob.java
License:Apache License
/** * Partition A on columns, where A refers to the path that contain a matrix in * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for * further details.//w ww .jav a 2s . c om * * @param conf the initial configuration * @param matrixInputPath the path to the input matrix A * @param matrixOutputPath the path of the resulting partitioned matrix * @param numInputRows rows * @param numInputCols cols * @param numColPartitions the hint for the desired number of column * partitions * @return the running job * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows, int numInputCols, int numColPartitions) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition"); int colPartSize = getColPartitionSize(numInputCols, numColPartitions); numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize); if (numReducers < numColPartitions) numReducers = numColPartitions; NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition"); conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows); conf.setInt(NUM_ORIG_COLS_KEY, numInputCols); conf.setInt(NUM_COL_PARTITIONS, numColPartitions); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(ColPartitionJob.class); job.setJobName(ColPartitionJob.class.getSimpleName()); matrixOutputPath = fs.makeQualified(matrixOutputPath); MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(ElementWritable.class); job.setMapOutputValueClass(VectorWritable.class); RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class, 
numInputRows, numInputCols, numColPartitions); job.setReducerClass(MyReducer.class); job.setNumReduceTasks(numReducers); // job.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); return job; }
From source file:com.twitter.algebra.nmf.CombinerJob.java
License:Apache License
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + CombinerJob.class.getName()); Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); CombinerJob job = new CombinerJob(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), outPath, A.numRows()); } else {//from w w w .j a va 2 s . c o m log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }