List of usage examples for org.apache.hadoop.fs Path toUri
public URI toUri()
From source file:com.twitter.algebra.AlgebraCommon.java
License:Apache License
/** * Write a vector to filesystem so that it can be used by distributed jobs * @param vector/*from w ww .j a v a 2 s . co m*/ * @param outputDir * @param label the unique label that be used in naming the vector file * @param conf * @return * @throws IOException */ public static Path toDistributedVector(Vector vector, Path outputDir, String label, Configuration conf) throws IOException { Path outputFile = new Path(outputDir, "Vector-" + label); FileSystem fs = FileSystem.get(outputDir.toUri(), conf); if (fs.exists(outputFile)) { log.warn("----------- OVERWRITE " + outputFile + " already exists"); fs.delete(outputFile, false); } @SuppressWarnings("deprecation") SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class, VectorWritable.class); VectorWritable vectorw = new VectorWritable(); vectorw.set(vector); writer.append(new IntWritable(0), vectorw); writer.close(); return outputFile; }
From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java
License:Apache License
/**
 * Runs {@link Sequence2MatrixFormatJob} on matrix {@code A} and wraps the result.
 *
 * <p>The output location is {@code label} under {@code A.getOutputTempPath()}. When that path
 * already exists the job is not run; a warning is logged and the existing path is wrapped
 * instead.
 *
 * @param conf the configuration used to resolve the filesystem, passed to the job and set on the
 *     returned matrix
 * @param A the matrix whose row path is used as the job input
 * @param label the name of the output directory under A's temp path
 * @return a DistributedRowMatrix over the output path with the same dimensions as {@code A}
 * @throws IOException if the filesystem cannot be accessed or the job fails
 * @throws InterruptedException propagated from the underlying job run
 * @throws ClassNotFoundException propagated from the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
    throws IOException, InterruptedException, ClassNotFoundException {
  log.info("running " + Sequence2MatrixFormatJob.class.getName());

  Path resultPath = new Path(A.getOutputTempPath(), label);
  FileSystem fileSystem = FileSystem.get(resultPath.toUri(), conf);
  Sequence2MatrixFormatJob formatJob = new Sequence2MatrixFormatJob();

  if (fileSystem.exists(resultPath)) {
    log.warn("----------- Skip already exists: " + resultPath);
  } else {
    formatJob.run(conf, A.getRowPath(), resultPath);
  }

  DistributedRowMatrix converted =
      new DistributedRowMatrix(resultPath, A.getOutputTempPath(), A.numRows(), A.numCols());
  converted.setConf(conf);
  return converted;
}
From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath) throws IOException, InterruptedException, ClassNotFoundException { FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "seq2mtx"); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(Sequence2MatrixFormatJob.class); job.setJobName(Sequence2MatrixFormatJob.class.getSimpleName()); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setNumReduceTasks(0);// w w w . ja v a 2 s . co m job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java
License:Apache License
/**
 * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix
 * object. Refer to {@link ABInnerHDFSBroadcastOfB} for further details.
 *
 * <p>The product is written to {@code label} under {@code A.getOutputTempPath()}. If that path
 * already exists the job is skipped (a warning is logged) and the existing path is wrapped.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object with {@code A.numRows()} rows and
 *     {@code B.numCols()} columns
 * @throws CardinalityException if the number of columns of A differs from the number of rows
 *     of B
 * @throws IOException if the filesystem cannot be accessed or the job fails
 * @throws InterruptedException propagated from the underlying job run
 * @throws ClassNotFoundException propagated from the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
    String label) throws IOException, InterruptedException, ClassNotFoundException {
  log.info("running " + ABInnerHDFSBroadcastOfB.class.getName());

  // Inner dimensions must agree for the product to be defined.
  if (A.numCols() != B.numRows()) {
    throw new CardinalityException(A.numCols(), B.numRows());
  }

  Path productPath = new Path(A.getOutputTempPath(), label);
  FileSystem fileSystem = FileSystem.get(productPath.toUri(), conf);
  ABInnerHDFSBroadcastOfB multiplyJob = new ABInnerHDFSBroadcastOfB();

  if (fileSystem.exists(productPath)) {
    log.warn("----------- Skip already exists: " + productPath);
  } else {
    multiplyJob.run(conf, A.getRowPath(), B.getRowPath(), productPath, B.numRows(), B.numCols());
  }

  DistributedRowMatrix product =
      new DistributedRowMatrix(productPath, A.getOutputTempPath(), A.numRows(), B.numCols());
  product.setConf(conf);
  return product;
}
From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java
License:Apache License
/** * Perform A x B, where A and B refer to the paths that contain matrices in * {@link SequenceFileInputFormat} Refer to {@link ABInnerHDFSBroadcastOfB} * for further details./*from w w w. j a v a 2s . co m*/ * * @param conf the initial configuration * @param matrixInputPath path to matrix A * @param inMemMatrixDir path to matrix B (must be small enough to fit into * memory) * @param matrixOutputPath path to which AxB will be written * @param inMemMatrixNumRows B rows * @param inMemMatrixNumCols B cols * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public void run(Configuration conf, Path matrixInputPath, String inMemMatrixDir, Path matrixOutputPath, int inMemMatrixNumRows, int inMemMatrixNumCols) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "axbinner"); conf.set(MATRIXINMEMORY, inMemMatrixDir); conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows); conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(ABInnerHDFSBroadcastOfB.class); job.setJobName(ABInnerHDFSBroadcastOfB.class.getSimpleName()); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setNumReduceTasks(0); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); // since we do not use reducer, to get total order, the map output files has // to be renamed after this function returns: {@link // AlgebraCommon#fixPartitioningProblem} job.submit(); boolean res = 
job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java
License:Apache License
/** * Perform A x B, where A and B are already wrapped in a DistributedRowMatrix * object. Refer to {@link ABOuterHDFSBroadcastOfA} for further details. * /* w w w . j ava2 s .c o m*/ * @param conf * the initial configuration * @param A * matrix A * @param B * matrix B * @param label * the label for the output directory * @return AxB wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B, String label) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + ABOuterHDFSBroadcastOfA.class.getName()); if (A.numCols() != B.numRows()) { throw new CardinalityException(A.numCols(), B.numRows()); } Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); ABOuterHDFSBroadcastOfA job = new ABOuterHDFSBroadcastOfA(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), A.numCols()); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), B.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java
License:Apache License
/**
 * Perform A x B, where the matrices are given as paths that contain them in
 * {@link SequenceFileInputFormat}. Refer to {@link ABOuterHDFSBroadcastOfA}
 * for further details.
 *
 * <p>Note that this method writes the in-memory-matrix settings directly into the supplied
 * {@code conf} before creating the job.
 *
 * @param conf
 *          the initial configuration; modified by this method
 * @param inMemMatrixDir
 *          location of the matrix that is registered under {@code MATRIXINMEMORY} (must be
 *          small enough to fit into memory); the static {@code run} overload passes matrix A
 *          here
 * @param matrixInputPath
 *          path to the matrix that is used as the job input; the static {@code run} overload
 *          passes matrix B here
 * @param matrixOutputPath
 *          path to which AxB will be written
 * @param inMemMatrixNumRows
 *          number of rows of the in-memory matrix; also handed to the row partitioner
 * @param inMemMatrixNumCols
 *          number of columns of the in-memory matrix
 * @throws IOException
 *           if the filesystem cannot be accessed or the job does not succeed
 * @throws InterruptedException
 *           propagated from job submission / completion
 * @throws ClassNotFoundException
 *           propagated from job submission / completion
 */
public void run(Configuration conf, String inMemMatrixDir, Path matrixInputPath, Path matrixOutputPath,
    int inMemMatrixNumRows, int inMemMatrixNumCols)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Register the in-memory matrix before the Job is built from conf.
  conf.set(MATRIXINMEMORY, inMemMatrixDir);
  conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
  conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);

  @SuppressWarnings("deprecation")
  Job mrJob = new Job(conf);
  mrJob.setJarByClass(ABOuterHDFSBroadcastOfA.class);
  mrJob.setJobName(ABOuterHDFSBroadcastOfA.class.getSimpleName());

  FileSystem fileSystem = FileSystem.get(matrixInputPath.toUri(), conf);
  Path qualifiedInput = fileSystem.makeQualified(matrixInputPath);
  Path qualifiedOutput = fileSystem.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(mrJob, qualifiedInput);
  mrJob.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(mrJob, qualifiedOutput);

  mrJob.setMapperClass(MyMapper.class);
  mrJob.setMapOutputKeyClass(IntWritable.class);
  mrJob.setMapOutputValueClass(VectorWritable.class);

  // ensures total order (when used with {@link MatrixOutputFormat}),
  RowPartitioner.setPartitioner(mrJob, RowPartitioner.IntRowPartitioner.class, inMemMatrixNumRows);

  mrJob.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
  mrJob.setReducerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

  mrJob.setOutputFormatClass(MatrixOutputFormat.class);
  mrJob.setOutputKeyClass(IntWritable.class);
  mrJob.setOutputValueClass(VectorWritable.class);

  mrJob.submit();
  boolean succeeded = mrJob.waitForCompletion(true);
  if (!succeeded) {
    throw new IOException("Job failed!");
  }
}
From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java
License:Apache License
public void run(Configuration conf, Path atPath, Path bPath, Path outPath, int outCardinality) throws IOException, InterruptedException, ClassNotFoundException { conf.setInt(OUT_CARD, outCardinality); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJobName(AtBOuterStaticMapsideJoinJob.class.getSimpleName()); job.setJarByClass(AtBOuterStaticMapsideJoinJob.class); FileSystem fs = FileSystem.get(atPath.toUri(), conf); atPath = fs.makeQualified(atPath);/*from w ww . j av a 2s . c o m*/ bPath = fs.makeQualified(bPath); job.setInputFormatClass(CompositeInputFormat.class); //mapside join expression job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, atPath, bPath)); job.setOutputFormatClass(MatrixOutputFormat.class); outPath = fs.makeQualified(outPath); FileOutputFormat.setOutputPath(job, outPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); job.setCombinerClass(MyReducer.class); int numReducers = conf.getInt("algebra.reduceslots.multiply", 10); job.setNumReduceTasks(numReducers); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed"); }
From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java
License:Apache License
/**
 * Runs {@link AtBOuterStaticMapsideJoinJob} on {@code A} and {@code B} and wraps the result.
 *
 * <p>The output location is {@code label} under {@code A.getOutputTempPath()}. When that path
 * already exists the job is not run; a warning is logged and the existing path is wrapped
 * instead.
 *
 * @param conf the configuration used to resolve the filesystem, passed to the job and set on the
 *     returned matrix
 * @param A the first input matrix
 * @param B the second input matrix
 * @param label the name of the output directory under A's temp path
 * @return a DistributedRowMatrix over the output path with {@code A.numCols()} rows and
 *     {@code B.numCols()} columns
 * @throws CardinalityException if A and B do not have the same number of rows
 * @throws IOException if the filesystem cannot be accessed or the job fails
 * @throws InterruptedException propagated from the underlying job run
 * @throws ClassNotFoundException propagated from the underlying job run
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
    String label) throws IOException, InterruptedException, ClassNotFoundException {
  log.info("running " + AtBOuterStaticMapsideJoinJob.class.getName());

  // Both inputs are joined row by row, so their row counts must agree.
  if (A.numRows() != B.numRows()) {
    throw new CardinalityException(A.numRows(), B.numRows());
  }

  Path productPath = new Path(A.getOutputTempPath(), label);
  FileSystem fileSystem = FileSystem.get(productPath.toUri(), conf);
  AtBOuterStaticMapsideJoinJob joinJob = new AtBOuterStaticMapsideJoinJob();

  if (fileSystem.exists(productPath)) {
    log.warn("----------- Skip already exists: " + productPath);
  } else {
    joinJob.run(conf, A.getRowPath(), B.getRowPath(), productPath, B.numCols());
  }

  DistributedRowMatrix product =
      new DistributedRowMatrix(productPath, A.getOutputTempPath(), A.numCols(), B.numCols());
  product.setConf(conf);
  return product;
}
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/** * Perform A x B, where At and B are already wrapped in a DistributedRowMatrix * object. Refer to {@link AtB_DMJ} for further details. * /*from w w w. j av a 2 s. c o m*/ * Automatically decide on partitioning the larger matrix to be used with * in-memory combiners. * * @param conf the initial configuration * @param At transpose of matrix A * @param B matrix B * @param label the label for the output directory * @param labelAtCol by using a fixed label for AtCol one can avoid the second * run of the partitioning job if we know that At is not changed * @param lableBCol by using a fixed label for BCol one can avoid the second * run of the partitioning job if we know that B is not changed * @return AxB wrapped in a DistributedRowMatrix object * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ public static DistributedRowMatrix smartRun(Configuration conf, DistributedRowMatrix At, DistributedRowMatrix B, String label, String labelAtCol, String lableBCol) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + AtB_DMJ.class.getName()); if (At.numRows() != B.numRows()) throw new CardinalityException(At.numRows(), B.numRows()); Path outPath = new Path(At.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); AtB_DMJ job = new AtB_DMJ(); if (!fs.exists(outPath)) { int numColPartitionsAt = 1, numColPartitionsB = 1; int numColPartitions = NMFCommon.computeOptColPartitionsForMemCombiner(conf, At.numCols(), B.numCols()); long atSize = MapDir.du(At.getRowPath(), fs); long bSize = MapDir.du(B.getRowPath(), fs); //cost is size of remote reads. For each col partition we need to read the entire of the other matrix once long atPartitionCost = numColPartitions * bSize; long bPartitionCost = numColPartitions * atSize; log.info("smart partitioning: numColPartitions: " + numColPartitions + " atSize: " + atSize + " bSize: " + bSize + " atCost=" + atPartitionCost + " vs. 
bCost=" + bPartitionCost); if (atPartitionCost < bPartitionCost) { At = ColPartitionJob.partition(At, conf, labelAtCol, numColPartitions); numColPartitionsAt = numColPartitions; } else { B = ColPartitionJob.partition(B, conf, lableBCol, numColPartitions); numColPartitionsB = numColPartitions; } job.run(conf, At.getRowPath(), B.getRowPath(), outPath, At.numCols(), B.numCols(), numColPartitionsAt, numColPartitionsB, true); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, At.getOutputTempPath(), At.numCols(), B.numCols()); distRes.setConf(conf); return distRes; }