List of usage examples for org.apache.hadoop.io.IntWritable.set(int)
public void set(int value)
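Before the longer examples, here is a minimal self-contained sketch of the idiom every example below relies on: allocate one IntWritable, then call set(int) to update it in place before each SequenceFile append. The output path and the Text values here are illustrative assumptions, not taken from the sources below.

// Minimal sketch (assumed demo path and values): reuse one IntWritable as a
// SequenceFile key, calling set(int) before every append. append() serializes
// the key immediately, which is why reusing a single mutable instance is safe.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class IntWritableSetExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path out = new Path("/tmp/intwritable-demo.seq"); // assumed demo path
    IntWritable key = new IntWritable(); // allocated once, mutated via set()
    Text value = new Text();
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, out, IntWritable.class, Text.class);
    try {
      for (int i = 0; i < 10; i++) {
        key.set(i);                // update the reused key in place
        value.set("row-" + i);
        writer.append(key, value); // key bytes are copied out here
      }
    } finally {
      writer.close();
    }
  }
}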
From source file:org.apache.mahout.common.mapreduce.TransposeMapper.java
License:Apache License
@Override
protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
  int row = r.get();
  for (Vector.Element e : v.get().nonZeroes()) {
    RandomAccessSparseVector tmp = new RandomAccessSparseVector(newNumCols, 1);
    tmp.setQuick(row, e.get());
    // Reuse the incoming key: emit (column index, single-entry column vector)
    r.set(e.index());
    ctx.write(r, new VectorWritable(tmp));
  }
}
From source file:org.apache.mahout.math.DistributedRowMatrixWriter.java
License:Apache License
public static void write(Path outputDir, Configuration conf, Iterable<MatrixSlice> matrix) throws IOException {
  FileSystem fs = outputDir.getFileSystem(conf);
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outputDir, IntWritable.class,
      VectorWritable.class);
  IntWritable topic = new IntWritable();
  VectorWritable vector = new VectorWritable();
  for (MatrixSlice slice : matrix) {
    topic.set(slice.index());
    vector.set(slice.vector());
    writer.append(topic, vector);
  }
  writer.close();
}
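All of the examples on this page populate IntWritable keys on the write side. As a hedged companion sketch (not taken from the Mahout sources on this page), here is the matching read side: SequenceFile.Reader.next(key, value) mutates the same reused IntWritable in place, mirroring what set(int) does when writing. The class name and path argument are illustrative assumptions.

// Hedged sketch: read back IntWritable/VectorWritable pairs written by a
// method like the one above, reusing a single key instance across records.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.VectorWritable;

public class MatrixSliceReader {
  public static void dump(Configuration conf, Path input) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(input.getFileSystem(conf), input, conf);
    try {
      IntWritable row = new IntWritable();   // reused across all records
      VectorWritable vec = new VectorWritable();
      while (reader.next(row, vec)) {        // fills row and vec in place
        System.out.println("row " + row.get() + ": " + vec.get());
      }
    } finally {
      reader.close();
    }
  }
}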
From source file:org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver.java
License:Apache License
/**
 * @param state The final LanczosState to be serialized
 * @param outputPath The path (relative to the current Configuration's FileSystem) to save the output to.
 */
public void serializeOutput(LanczosState state, Path outputPath) throws IOException {
  int numEigenVectors = state.getIterationNumber();
  log.info("Persisting {} eigenVectors and eigenValues to: {}", numEigenVectors, outputPath);
  Configuration conf = getConf() != null ? getConf() : new Configuration();
  FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
  SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, outputPath, IntWritable.class,
      VectorWritable.class);
  try {
    IntWritable iw = new IntWritable();
    for (int i = 0; i < numEigenVectors; i++) {
      // Persist eigenvectors sorted by eigenvalue in descending order
      NamedVector v = new NamedVector(state.getRightSingularVector(numEigenVectors - 1 - i),
          "eigenVector" + i + ", eigenvalue = " + state.getSingularValue(numEigenVectors - 1 - i));
      Writable vw = new VectorWritable(v);
      iw.set(i);
      seqWriter.append(iw, vw);
    }
  } finally {
    Closeables.close(seqWriter, false);
  }
}
From source file:org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob.java
License:Apache License
private void saveCleanEigens(Configuration conf, Collection<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta)
    throws IOException {
  Path path = new Path(outPath, CLEAN_EIGENVECTORS);
  FileSystem fs = FileSystem.get(path.toUri(), conf);
  SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
  try {
    IntWritable iw = new IntWritable();
    int numEigensWritten = 0;
    int index = 0;
    for (Map.Entry<MatrixSlice, EigenStatus> pruneSlice : prunedEigenMeta) {
      MatrixSlice s = pruneSlice.getKey();
      EigenStatus meta = pruneSlice.getValue();
      EigenVector ev = new EigenVector(s.vector(), meta.getEigenValue(), Math.abs(1 - meta.getCosAngle()),
          s.index());
      // log.info("appending {} to {}", ev, path);
      Writable vw = new VectorWritable(ev);
      iw.set(index++);
      seqWriter.append(iw, vw);
      // Increment the number of eigenvectors written and see if we've reached our
      // specified limit, or if we wish to write all eigenvectors (the latter is
      // built in, since numEigensWritten will always be > 0).
      numEigensWritten++;
      if (numEigensWritten == maxEigensToKeep) {
        log.info("{} of the {} total eigens have been written", maxEigensToKeep, prunedEigenMeta.size());
        break;
      }
    }
  } finally {
    Closeables.close(seqWriter, false);
  }
  cleanedEigensPath = path;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCADenseTest.java
License:Apache License
public void runSSVDSolver(int q) throws IOException {
  Configuration conf = new Configuration();
  conf.set("mapred.job.tracker", "local");
  conf.set("fs.default.name", "file:///");

  // conf.set("mapred.job.tracker","localhost:11011");
  // conf.set("fs.default.name","hdfs://localhost:11010/");

  Deque<Closeable> closeables = new LinkedList<Closeable>();
  Random rnd = RandomUtils.getRandom();

  File tmpDir = getTestTempDir("svdtmp");
  conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

  Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

  // create distributed row matrix-like struct
  SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath, IntWritable.class,
      VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
  closeables.addFirst(w);

  int n = 100;
  int m = 2000;
  double percent = 5;

  VectorWritable vw = new VectorWritable();
  IntWritable roww = new IntWritable();

  Vector xi = new DenseVector(n);

  double muAmplitude = 50.0;
  for (int i = 0; i < m; i++) {
    Vector dv = new SequentialAccessSparseVector(n);
    for (int j = 0; j < n * percent / 100; j++) {
      dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
    }
    roww.set(i);
    vw.set(dv);
    w.append(roww, vw);
    xi.assign(dv, Functions.PLUS);
  }
  closeables.remove(w);
  Closeables.close(w, true);

  xi.assign(Functions.mult(1.0 / m)); // mean row; 1 / m would be integer division (0)

  FileSystem fs = FileSystem.get(conf);

  Path tempDirPath = getTestTempDirPath("svd-proc");
  Path aPath = new Path(tempDirPath, "A/A.seq");
  fs.copyFromLocalFile(aLocPath, aPath);
  Path xiPath = new Path(tempDirPath, "xi/xi.seq");
  SSVDHelper.saveVector(xi, xiPath, conf);

  Path svdOutPath = new Path(tempDirPath, "SSVD-out");

  // make sure we wipe out previous test results, just a convenience
  fs.delete(svdOutPath, true);

  // Solver starts here:
  System.out.println("Input prepared, starting solver...");

  int ablockRows = 867;
  int p = 60;
  int k = 40;
  SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
  ssvd.setOuterBlockHeight(500);
  ssvd.setAbtBlockHeight(251);
  ssvd.setPcaMeanPath(xiPath);

  /*
   * Removing the V and U jobs from this test to reduce running time; they are
   * kept in the dense test.
   */
  ssvd.setComputeU(false);
  ssvd.setComputeV(false);

  ssvd.setOverwrite(true);
  ssvd.setQ(q);
  ssvd.setBroadcast(true);
  ssvd.run();

  Vector stochasticSValues = ssvd.getSingularValues();
  System.out.println("--SSVD solver singular values:");
  LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);

  System.out.println("--Colt SVD solver singular values:");
  // try to run the same thing without the stochastic algorithm
  double[][] a = SSVDHelper.loadDistributedRowMatrix(fs, aPath, conf);

  // subtract the pseudo-PCA mean
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      a[i][j] -= xi.getQuick(j);
    }
  }

  SingularValueDecomposition svd2 = new SingularValueDecomposition(new DenseMatrix(a));

  Vector svalues2 = new DenseVector(svd2.getSingularValues());
  LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

  for (int i = 0; i < k + p; i++) {
    assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
  }

  double[][] mQ = SSVDHelper.loadDistributedRowMatrix(fs,
      new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"), conf);
  SSVDCommonTest.assertOrthonormality(new DenseMatrix(mQ), false, s_epsilon);
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDSolverSparseSequentialTest.java
License:Apache License
public void runSSVDSolver(int q) throws IOException {
  Configuration conf = getConfiguration();
  conf.set("mapred.job.tracker", "local");
  conf.set("fs.default.name", "file:///");

  // conf.set("mapred.job.tracker","localhost:11011");
  // conf.set("fs.default.name","hdfs://localhost:11010/");

  Deque<Closeable> closeables = Lists.newLinkedList();
  Random rnd = RandomUtils.getRandom();

  File tmpDir = getTestTempDir("svdtmp");
  conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

  Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

  // create distributed row matrix-like struct
  SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath, IntWritable.class,
      VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
  closeables.addFirst(w);

  int n = 100;
  int m = 2000;
  double percent = 5;

  VectorWritable vw = new VectorWritable();
  IntWritable roww = new IntWritable();

  double muAmplitude = 50.0;
  for (int i = 0; i < m; i++) {
    Vector dv = new SequentialAccessSparseVector(n);
    for (int j = 0; j < n * percent / 100; j++) {
      dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.5));
    }
    roww.set(i);
    vw.set(dv);
    w.append(roww, vw);
  }
  closeables.remove(w);
  Closeables.close(w, false);

  FileSystem fs = FileSystem.get(aLocPath.toUri(), conf);

  Path tempDirPath = getTestTempDirPath("svd-proc");
  Path aPath = new Path(tempDirPath, "A/A.seq");
  fs.copyFromLocalFile(aLocPath, aPath);

  Path svdOutPath = new Path(tempDirPath, "SSVD-out");

  // make sure we wipe out previous test results, just a convenience
  fs.delete(svdOutPath, true);

  // Solver starts here:
  System.out.println("Input prepared, starting solver...");

  int ablockRows = 867;
  int p = 60;
  int k = 40;
  SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
  ssvd.setOuterBlockHeight(500);
  ssvd.setAbtBlockHeight(251);

  /*
   * Removing the V and U jobs from this test to reduce running time; they are
   * kept in the dense test.
   */
  ssvd.setComputeU(false);
  ssvd.setComputeV(false);

  ssvd.setOverwrite(true);
  ssvd.setQ(q);
  ssvd.setBroadcast(true);
  ssvd.run();

  Vector stochasticSValues = ssvd.getSingularValues();
  System.out.println("--SSVD solver singular values:");
  dumpSv(stochasticSValues);

  System.out.println("--Colt SVD solver singular values:");
  // try to run the same thing without the stochastic algorithm
  DenseMatrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

  // SingularValueDecompositionImpl svd=new SingularValueDecompositionImpl(new
  // Array2DRowRealMatrix(a));
  SingularValueDecomposition svd2 = new SingularValueDecomposition(a);

  Vector svalues2 = new DenseVector(svd2.getSingularValues());
  dumpSv(svalues2);

  for (int i = 0; i < k + p; i++) {
    assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
  }

  DenseMatrix mQ = SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"), conf);
  SSVDCommonTest.assertOrthonormality(mQ, false, s_epsilon);

  IOUtils.close(closeables);
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.SSVDTestsHelper.java
License:Apache License
/**
 * Generate some random but meaningful input with singular value ratios of
 * n, n-1, ... 1.
 *
 * @param outputPath output path for the generated matrix
 */
static void generateDenseInput(Path outputPath, FileSystem dfs, Vector svalues, int m, int n, int startRowKey)
    throws IOException {
  Random rnd = RandomUtils.getRandom();
  int svCnt = svalues.size();
  Matrix v = generateDenseOrthonormalRandom(n, svCnt, rnd);
  Matrix u = generateDenseOrthonormalRandom(m, svCnt, rnd);

  // apply singular values
  Matrix mx = m > n ? v : u;
  for (int i = 0; i < svCnt; i++) {
    mx.assignColumn(i, mx.viewColumn(i).times(svalues.getQuick(i)));
  }

  SequenceFile.Writer w = SequenceFile.createWriter(dfs, dfs.getConf(), outputPath, IntWritable.class,
      VectorWritable.class);
  try {
    Vector outV = new DenseVector(n);
    Writable vw = new VectorWritable(outV);
    IntWritable iw = new IntWritable();
    for (int i = 0; i < m; i++) {
      iw.set(startRowKey + i);
      for (int j = 0; j < n; j++) {
        outV.setQuick(j, u.viewRow(i).dot(v.viewRow(j)));
      }
      w.append(iw, vw);
    }
  } finally {
    w.close();
  }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.SSVDTestsHelper.java
License:Apache License
public static void main(String[] args) throws Exception {
  // create 1Gb input for distributed tests.
  MahoutTestCase ca = new MahoutTestCase();
  Configuration conf = ca.getConfiguration();
  FileSystem dfs = FileSystem.getLocal(conf);
  Path outputDir = new Path("/tmp/DRM");
  dfs.mkdirs(outputDir);

  // for ( int i = 1; i <= 10; i++ ) {
  //   generateDenseInput(new Path(outputDir,String.format("part-%05d",i)),dfs,
  //     new DenseVector ( new double[] {
  //       15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0.8,0.3,0.1,0.01
  //     }),1200,10000,(i-1)*1200);
  // }

  /*
   * Create 2Gb sparse 4.5m x 4.5m input (similar to the wikipedia graph).
   *
   * In order to get to 2Gb, we need to generate ~40 non-zero items per row on average.
   */
  outputDir = new Path("/tmp/DRM-sparse");
  Random rnd = RandomUtils.getRandom();

  SequenceFile.Writer w = SequenceFile.createWriter(dfs, dfs.getConf(), new Path(outputDir, "sparse.seq"),
      IntWritable.class, VectorWritable.class);

  try {
    IntWritable iw = new IntWritable();
    VectorWritable vw = new VectorWritable();
    int avgNZero = 40;
    int n = 4500000;
    for (int i = 1; i < n; i++) {
      Vector vector = new RandomAccessSparseVector(n);
      double nz = Math.round(avgNZero * (rnd.nextGaussian() + 1));
      if (nz < 0) {
        nz = 0;
      }
      for (int j = 1; j < nz; j++) {
        vector.set(rnd.nextInt(n), rnd.nextGaussian() * 25 + 3);
      }
      iw.set(i);
      vw.set(vector);
      w.append(iw, vw);
    }
  } finally {
    w.close();
  }
}
From source file:org.apache.mahout.math.MatrixUtils.java
License:Apache License
public static void write(Path outputDir, Configuration conf, VectorIterable matrix) throws IOException {
  FileSystem fs = outputDir.getFileSystem(conf);
  fs.delete(outputDir, true);
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outputDir, IntWritable.class,
      VectorWritable.class);
  IntWritable topic = new IntWritable();
  VectorWritable vector = new VectorWritable();
  for (MatrixSlice slice : matrix) {
    topic.set(slice.index());
    vector.set(slice.vector());
    writer.append(topic, vector);
  }
  writer.close();
}
From source file:org.apache.mahout.utils.SplitInputTest.java
License:Apache License
/**
 * Create a SequenceFile for testing, consisting of IntWritable
 * keys and VectorWritable values.
 *
 * @param path path for the test SequenceFile
 * @param testPoints number of records in the test SequenceFile
 */
private void writeVectorSequenceFile(Path path, int testPoints) throws IOException {
  Path tempSequenceFile = new Path(path, "part-00000");
  Configuration conf = getConfiguration();
  IntWritable key = new IntWritable();
  VectorWritable value = new VectorWritable();
  SequenceFile.Writer writer = null;
  try {
    writer = SequenceFile.createWriter(fs, conf, tempSequenceFile, IntWritable.class, VectorWritable.class);
    for (int i = 0; i < testPoints; i++) {
      key.set(i);
      Vector v = new SequentialAccessSparseVector(4);
      v.assign(i);
      value.set(v);
      writer.append(key, value);
    }
  } finally {
    IOUtils.closeStream(writer);
  }
}