List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
From source file:com.twitter.algebra.nmf.CombinerJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); // conf.setBoolean("mapreduce.output.compress", true); // conf.setBoolean("mapreduce.output.fileoutputformat.compress", true); // conf.set("mapreduce.output.fileoutputformat.compress.codec", "com.hadoop.compression.lzo.LzoCodec"); conf.setInt("dfs.replication", 20); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(CombinerJob.class); job.setJobName(CombinerJob.class.getSimpleName() + "-" + matrixOutputPath.getName()); FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "combiner"); job.setNumReduceTasks(numReducers);// TODO: make it a parameter job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.setMapperClass(IdMapper.class); job.setReducerClass(MergeVectorsReducer.class); RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows); job.submit();//from w w w .j av a 2 s.c om boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
/** * Refer to {@link CompositeDMJ} for further details. *///from ww w.j a va2 s. co m public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B, DistributedRowMatrix inMemC, String label, float alpha1, float alpha2) throws IOException, InterruptedException, ClassNotFoundException { log.info("running " + CompositeDMJ.class.getName()); if (A.numRows() != B.numRows()) { throw new CardinalityException(A.numRows(), B.numRows()); } if (A.numCols() != B.numCols()) { throw new CardinalityException(A.numCols(), B.numCols()); } if (A.numCols() != inMemC.numRows()) { throw new CardinalityException(A.numCols(), inMemC.numRows()); } if (inMemC.numCols() != inMemC.numRows()) { throw new CardinalityException(inMemC.numCols(), inMemC.numRows()); } Path outPath = new Path(A.getOutputTempPath(), label); FileSystem fs = FileSystem.get(outPath.toUri(), conf); CompositeDMJ job = new CompositeDMJ(); if (!fs.exists(outPath)) { job.run(conf, A.getRowPath(), B.getRowPath(), outPath, A.numRows(), inMemC.getRowPath(), inMemC.numRows(), inMemC.numCols(), alpha1, alpha2); } else { log.warn("----------- Skip already exists: " + outPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outPath, A.getOutputTempPath(), A.numRows(), A.numCols()); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
/**
 * Path-level entry point: measures both inputs with {@code MapDir.du}, hands the smaller one
 * to the job as the map-dir matrix and the other one as the job input, then waits for the
 * job to finish.
 *
 * @param conf Hadoop configuration
 * @param aPath path of matrix A; also selects the file system used for sizing
 * @param bPath path of matrix B
 * @param matrixOutputPath where the job writes its result
 * @param atCols forwarded unchanged to the job
 * @param inMemCStr forwarded unchanged to the job
 * @param inMemCRows forwarded unchanged to the job
 * @param inMemCCols forwarded unchanged to the job
 * @param alpha1 forwarded unchanged to the job
 * @param alpha2 forwarded unchanged to the job
 * @throws IOException if the job reports failure or Hadoop I/O fails
 */
public void run(Configuration conf, Path aPath, Path bPath, Path matrixOutputPath, int atCols, String inMemCStr,
        int inMemCRows, int inMemCCols, float alpha1, float alpha2)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fileSystem = FileSystem.get(aPath.toUri(), conf);
    long atSize = MapDir.du(aPath, fileSystem);
    long bSize = MapDir.du(bPath, fileSystem);
    log.info("Choosing the smaller matrix: atSize: " + atSize + " bSize: " + bSize);

    // A becomes the map-dir side only when it is strictly smaller than B.
    boolean aIsMapDir = atSize < bSize;
    Path mapDirSide = aIsMapDir ? aPath : bPath;
    Path inputSide = aIsMapDir ? bPath : aPath;

    CompositeDMJ dmj = new CompositeDMJ();
    Job submitted = dmj.run(conf, mapDirSide, inputSide, matrixOutputPath, atCols, aIsMapDir, inMemCStr,
            inMemCRows, inMemCCols, alpha1, alpha2);

    boolean succeeded = submitted.waitForCompletion(true);
    if (!succeeded) {
        throw new IOException("Job failed! ");
    }
}
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols, boolean aIsMapDir, String inMemCStr, int inMemCRows, int inMemCCols, float alpha1, float alpha2) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); conf.set(MATRIXINMEMORY, inMemCStr); conf.setInt(MATRIXINMEMORYROWS, inMemCRows); conf.setInt(MATRIXINMEMORYCOLS, inMemCCols); conf.setFloat(ALPHA1, alpha1);//from w w w. j a v a2 s .co m conf.setFloat(ALPHA2, alpha2); FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "compositedmj"); conf.set(MAPDIRMATRIX, mapDirPath.toString()); conf.setBoolean(AISMAPDIR, aIsMapDir); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(CompositeDMJ.class); job.setJobName(CompositeDMJ.class.getSimpleName() + "-" + matrixOutputPath.getName()); matrixOutputPath = fs.makeQualified(matrixOutputPath); matrixInputPaths = fs.makeQualified(matrixInputPaths); MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); // ensures total order (when used with {@link MatrixOutputFormat}), RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols); job.setNumReduceTasks(0); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); return job; }
From source file:com.twitter.algebra.nmf.DistRndMatrixJob.java
License:Apache License
public static DistributedRowMatrix random(Configuration conf, int rows, int cols, Path tmpPath, String label) throws IOException, InterruptedException, ClassNotFoundException { Path inputPath = new Path(tmpPath, "null-in"); Path outputPath = new Path(tmpPath, "Random-" + label + "-" + rows + "-" + cols); FileSystem fs = FileSystem.get(outputPath.toUri(), conf); DistRndMatrixJob job = new DistRndMatrixJob(); if (!fs.exists(inputPath)) { FSDataOutputStream inFile = fs.create(inputPath); inFile.write("NullValue".getBytes()); inFile.close();//from ww w. j a va2 s . c om } if (!fs.exists(outputPath)) { job.run(conf, inputPath, outputPath, rows, cols); } else { log.warn("----------- Skip already exists: " + outputPath); } DistributedRowMatrix distRes = new DistributedRowMatrix(outputPath, tmpPath, rows, cols); distRes.setConf(conf); return distRes; }
From source file:com.twitter.algebra.nmf.DistRndMatrixJob.java
License:Apache License
public void run(Configuration conf, Path inPath, Path matrixOutputPath, int numInputRows, int numInputCols) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); conf.setInt(ROWS, numInputRows);//from w w w. ja va 2s .co m conf.setInt(COLS, numInputCols); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(DistRndMatrixJob.class); job.setJobName(DistRndMatrixJob.class.getSimpleName() + "-" + matrixOutputPath.getName()); FileSystem fs = FileSystem.get(inPath.toUri(), conf); inPath = fs.makeQualified(inPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, inPath); job.setInputFormatClass(TextInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(NullWritable.class); // ensures total order (when used with {@link MatrixOutputFormat}), RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "random"); job.setNumReduceTasks(numReducers); job.setReducerClass(MyReducer.class); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.nmf.Edge2MapDirJob.java
License:Apache License
public static void format(Configuration conf, Path input, Path tmpPath, String label, String name) throws IOException, InterruptedException, ClassNotFoundException { int totalIndex = ReindexerJob.index(conf, input, tmpPath, ReindexerJob.getName(name)); Path indexPath = new Path(tmpPath, ReindexerJob.getName(name)); // TODO: here we assume that input matrix is square Path outputPath = new Path(tmpPath, label); FileSystem fs = FileSystem.get(outputPath.toUri(), conf); Edge2MapDirJob job = new Edge2MapDirJob(); if (!fs.exists(outputPath)) { job.run(conf, input, outputPath, totalIndex, totalIndex, indexPath.toString()); } else {//from w w w . j a v a2s.c o m log.warn("----------- Skip already exists: " + outputPath); } }
From source file:com.twitter.algebra.nmf.Edge2MapDirJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows, int numInputCols, String name) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); conf.set(INDEXNAME, name);/*from www . ja v a 2 s. co m*/ conf.setInt(ROWS, numInputRows); conf.setInt(COLS, numInputCols); conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t"); FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "edge2matrix"); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(Edge2MapDirJob.class); job.setJobName(Edge2MapDirJob.class.getSimpleName() + "-" + matrixOutputPath.getName()); matrixInputPath = fs.makeQualified(matrixInputPath); matrixOutputPath = fs.makeQualified(matrixOutputPath); FileInputFormat.addInputPath(job, matrixInputPath); job.setInputFormatClass(KeyValueTextInputFormat.class); FileOutputFormat.setOutputPath(job, matrixOutputPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "edge2matrix"); job.setNumReduceTasks(numReducers); // ensures total order (when used with {@link MatrixOutputFormat}), RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows); job.setCombinerClass(MergeVectorsCombiner.class); job.setReducerClass(MergeVectorsReducer.class); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed!"); }
From source file:com.twitter.algebra.nmf.ErrDMJ.java
License:Apache License
/**
 * Matrix-level entry point of {@link ErrDMJ}: runs the error job (unless its output already
 * exists), then reads the output back and logs a per-column error together with the maximum
 * and average column error.
 *
 * <p>Dimension requirements: {@code X} and {@code A} have the same number of rows,
 * {@code A} and {@code Yt} have the same number of columns, and the column count of
 * {@code X} equals the row count of {@code Yt}.
 *
 * <p>The per-column error is {@code sqrt(err / orig)}, where {@code err} is the value read
 * from the job output and {@code orig} is the matching entry of {@code xColSumVec}.
 *
 * @param conf Hadoop configuration
 * @param X matrix whose path is forwarded to the job as the X map-dir matrix
 * @param xColSumVec per-column reference values used to normalise the error
 * @param A matrix used as job input; its temp path hosts the output
 * @param Yt matrix whose path is forwarded to the job as the Yt map-dir matrix
 * @param label name of the output directory under {@code A}'s temp path
 * @return the value of the job counter {@code Result/sumAbs}, or {@code -1} when the job
 *         was skipped because the output already existed
 * @throws CardinalityException if any of the dimension checks fails
 * @throws IOException if file system access or the job fails
 */
public static long run(Configuration conf, DistributedRowMatrix X, Vector xColSumVec, DistributedRowMatrix A,
        DistributedRowMatrix Yt, String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ErrDMJ.class.getName());
    if (X.numRows() != A.numRows()) {
        // Report the two values that were actually compared.
        throw new CardinalityException(X.numRows(), A.numRows());
    }
    if (A.numCols() != Yt.numCols()) {
        throw new CardinalityException(A.numCols(), Yt.numCols());
    }
    if (X.numCols() != Yt.numRows()) {
        throw new CardinalityException(X.numCols(), Yt.numRows());
    }

    Path outPath = new Path(A.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    ErrDMJ job = new ErrDMJ();
    long totalErr = -1;
    if (!fs.exists(outPath)) {
        Job hJob = job.run(conf, X.getRowPath(), A.getRowPath(), Yt.getRowPath(), outPath, A.numRows(),
                Yt.numRows(), Yt.numCols());
        Counters counters = hJob.getCounters();
        // Keep the counter value; it is both logged and returned.
        totalErr = counters.findCounter("Result", "sumAbs").getValue();
        log.info("FINAL ERR is " + totalErr);
    } else {
        log.warn("----------- Skip already exists: " + outPath);
    }

    Vector sumErrVec = AlgebraCommon.mapDirToSparseVector(outPath, 1, X.numCols(), conf);
    // Column errors come from Math.sqrt and are therefore never negative, so 0 is a valid
    // lower bound (Double.MIN_VALUE is the smallest positive double, not a minimum).
    double maxColErr = 0.0;
    double sumColErr = 0;
    int cntColErr = 0;
    for (Vector.Element el : sumErrVec.nonZeroes()) {
        double errP2 = el.get();
        double origP2 = xColSumVec.get(el.index());
        double colErr = Math.sqrt(errP2 / origP2);
        log.info("col: " + el.index() + " sum(err^2): " + errP2 + " sum(val^2): " + origP2 + " colErr: "
                + colErr);
        maxColErr = Math.max(colErr, maxColErr);
        sumColErr += colErr;
        cntColErr++;
    }
    log.info(" Max Col Err: " + maxColErr);
    // Logs NaN when the output has no non-zero entries.
    log.info(" Avg Col Err: " + sumColErr / cntColErr);
    return totalErr;
}
From source file:com.twitter.algebra.nmf.ErrDMJ.java
License:Apache License
public Job run(Configuration conf, Path xPath, Path matrixAInputPath, Path ytPath, Path outPath, int aRows, int ytRows, int ytCols) throws IOException, InterruptedException, ClassNotFoundException { conf = new Configuration(conf); conf.set(MAPDIRMATRIXX, xPath.toString()); conf.set(MAPDIRMATRIXYT, ytPath.toString()); conf.setInt(YTROWS, ytRows);//from w w w .ja va 2 s . co m conf.setInt(YTCOLS, ytCols); FileSystem fs = FileSystem.get(outPath.toUri(), conf); NMFCommon.setNumberOfMapSlots(conf, fs, matrixAInputPath, "err"); @SuppressWarnings("deprecation") Job job = new Job(conf); job.setJarByClass(ErrDMJ.class); job.setJobName(ErrDMJ.class.getSimpleName() + "-" + outPath.getName()); matrixAInputPath = fs.makeQualified(matrixAInputPath); MultipleInputs.addInputPath(job, matrixAInputPath, SequenceFileInputFormat.class); outPath = fs.makeQualified(outPath); FileOutputFormat.setOutputPath(job, outPath); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(VectorWritable.class); int numReducers = 1; job.setNumReduceTasks(numReducers); job.setCombinerClass(SumVectorsReducer.class); job.setReducerClass(SumVectorsReducer.class); job.setOutputFormatClass(MatrixOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(VectorWritable.class); job.submit(); boolean res = job.waitForCompletion(true); if (!res) throw new IOException("Job failed! "); return job; }