List of usage examples for org.apache.mahout.common.iterator.sequencefile SequenceFileDirValueIterable SequenceFileDirValueIterable
public SequenceFileDirValueIterable(Path path, PathType pathType, PathFilter filter, Configuration conf)
From source file:DisplayClustering.java
License:Apache License
protected static List<Cluster> readClustersWritable(Path clustersIn) { List<Cluster> clusters = Lists.newArrayList(); Configuration conf = new Configuration(); for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { Cluster cluster = value.getValue(); log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null), cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null)); clusters.add(cluster);//from w ww . j a va2s . c om } return clusters; }
From source file:com.modofo.molo.cluster.DisplayClustering.java
License:Apache License
/**
 * Reads all {@link ClusterWritable} entries found under {@code clustersIn} and returns the
 * clusters they carry.
 *
 * <p>Each cluster is logged (id, formatted center, observation count, formatted radius) before
 * being appended to the result.
 *
 * @param clustersIn path handed to {@link SequenceFileDirValueIterable} with {@code PathType.LIST}
 * @return list of clusters in read order
 */
protected static List<Cluster> readClustersWritable(Path clustersIn) {
  final String logFormat = "Reading Cluster:{} center:{} numPoints:{} radius:{}";
  Configuration configuration = new Configuration();
  List<Cluster> loaded = Lists.newArrayList();
  for (ClusterWritable entry : new SequenceFileDirValueIterable<ClusterWritable>(
      clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), configuration)) {
    Cluster unwrapped = entry.getValue();
    Object[] logArgs = new Object[] {
        unwrapped.getId(),
        AbstractCluster.formatVector(unwrapped.getCenter(), null),
        unwrapped.getNumObservations(),
        AbstractCluster.formatVector(unwrapped.getRadius(), null)
    };
    log.info(logFormat, logArgs);
    loaded.add(unwrapped);
  }
  return loaded;
}
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
/**
 * Runs k-means in-process: repeatedly assigns every input vector to its nearest cluster, tests
 * for convergence, and writes the clusters of each iteration to
 * {@code <output>/<CLUSTERS_DIR><iteration>/part-r-00000}.
 *
 * <p>After the loop ends (convergence or {@code maxIterations} reached) the directory of the
 * last iteration is renamed by appending {@code Cluster.FINAL_ITERATION_SUFFIX}.
 *
 * @param conf          Hadoop configuration used for all file access
 * @param input         path of the input vectors ({@link VectorWritable} values)
 * @param clustersIn    path of the initial clusters, loaded through
 *                      {@code MemKMeansUtil.configureWithClusterInfo}
 * @param output        parent directory for the per-iteration cluster directories
 * @param measure       distance measure handed to the {@link KMeansClusterer}
 * @param maxIterations upper bound on the number of iterations
 * @param delta         convergence threshold, parsed with {@link Double#parseDouble(String)}
 * @return the path the last iteration's cluster directory is renamed to
 * @throws IOException           if reading or writing the sequence files fails
 * @throws IllegalStateException if no initial clusters could be loaded
 * @throws NumberFormatException if {@code delta} is not a parsable double
 */
private static Path buildClustersSeq(Configuration conf, Path input, Path clustersIn, Path output,
    DistanceMeasure measure, int maxIterations, String delta) throws IOException {
  KMeansClusterer clusterer = new KMeansClusterer(measure);
  Collection<Cluster> clusters = Lists.newArrayList();
  MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
  if (clusters.isEmpty()) {
    throw new IllegalStateException("Clusters is empty!");
  }

  // Loop-invariant work: parse the threshold and resolve the file system once. Parsing up front
  // also rejects a malformed delta before a full pass over the input has been spent.
  double convergenceDelta = Double.parseDouble(delta);
  FileSystem fs = FileSystem.get(input.toUri(), conf);

  boolean converged = false;
  int iteration = 1;
  while (!converged && iteration <= maxIterations) {
    log.info("K-Means Iteration: {}", iteration);
    for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
        PathFilters.logsCRCFilter(), conf)) {
      clusterer.addPointToNearestCluster(value.get(), clusters);
    }
    converged = clusterer.testConvergence(clusters, convergenceDelta);

    Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersOut, "part-r-00000"),
        Text.class, Cluster.class);
    try {
      for (Cluster cluster : clusters) {
        if (log.isDebugEnabled()) {
          log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}",
              new Object[] { cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
                  cluster.getNumPoints(), AbstractCluster.formatVector(cluster.getRadius(), null),
                  clustersOut.getName() });
        }
        writer.append(new Text(cluster.getIdentifier()), cluster);
      }
    } finally {
      Closeables.closeQuietly(writer);
    }
    iteration++;
  }

  Path lastClusters = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1));
  Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)
      + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX);
  // Rename through the same file system the cluster files were written with; the default file
  // system from FileSystem.get(conf) is not necessarily the one that holds them.
  if (!fs.rename(lastClusters, finalClustersIn)) {
    // Keep the original best-effort behaviour (no exception) but make the failure visible.
    log.warn("Could not rename {} to {}", lastClusters, finalClustersIn);
  }
  return finalClustersIn;
}
From source file:edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java
License:Apache License
/** Configure the mapper with the cluster info */ public static void configureWithClusterInfo(Configuration conf, Path clusterPath, Collection<Cluster> clusters) { for (Writable value : new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST, PathFilters.partFilter(), conf)) { Class<? extends Writable> valueClass = value.getClass(); if (valueClass.equals(Cluster.class)) { // get the cluster info clusters.add((Cluster) value); } else if (valueClass.equals(Canopy.class)) { // get the cluster info Canopy canopy = (Canopy) value; clusters.add(new Cluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure())); } else {//from w w w.j a v a2 s .com throw new IllegalStateException("Bad value class: " + valueClass); } } }
From source file:io.github.thushear.display.DisplayClustering.java
License:Apache License
/**
 * Reads the {@link Cluster} values stored under {@code clustersIn}.
 *
 * <p>Each cluster is logged at info level (id, formatted center, point count, formatted radius)
 * and appended to the returned list in read order.
 *
 * @param clustersIn path handed to {@link SequenceFileDirValueIterable} with {@code PathType.LIST}
 *                   and {@code PathFilters.logsCRCFilter()}
 * @return the clusters that were read; empty if the iterable yields nothing
 */
protected static List<Cluster> readClusters(Path clustersIn) {
  Configuration hadoopConf = new Configuration();
  Iterable<Cluster> source = new SequenceFileDirValueIterable<Cluster>(clustersIn, PathType.LIST,
      PathFilters.logsCRCFilter(), hadoopConf);

  List<Cluster> collected = new ArrayList<Cluster>();
  for (Cluster current : source) {
    Object[] fields = new Object[] {
        current.getId(),
        AbstractCluster.formatVector(current.getCenter(), null),
        current.getNumPoints(),
        AbstractCluster.formatVector(current.getRadius(), null)
    };
    log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", fields);
    collected.add(current);
  }
  return collected;
}
From source file:org.qcri.pca.PCATest.java
License:Apache License
@Test public void crossTestIterationOfMapReducePPCASequentialPPCA() throws Exception { Matrix C_central = PCACommon.randomMatrix(D, d); double ss = PCACommon.randSS(); InitialValues initValSeq = new InitialValues(C_central, ss); InitialValues initValMR = new InitialValues(C_central.clone(), ss); //1. run sequential Matrix Ye_central = new DenseMatrix(N, D); int row = 0;/*from w w w . j a v a 2 s .c o m*/ for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, null, conf)) { Ye_central.assignRow(row, vw.get()); row++; } double bishopSeqErr = ppcaDriver.runSequential(conf, Ye_central, initValSeq, 1); //2. run mapreduce DistributedRowMatrix Ye = new DistributedRowMatrix(input, tmp, N, D); Ye.setConf(conf); double bishopMRErr = ppcaDriver.runMapReduce(conf, Ye, initValMR, output, N, D, d, 1, 1, 1, 1); Assert.assertEquals("ss value is different in sequential and mapreduce PCA", initValSeq.ss, initValMR.ss, EPSILON); double seqCTrace = PCACommon.trace(initValSeq.C); double mrCTrace = PCACommon.trace(initValMR.C); Assert.assertEquals("C value is different in sequential and mapreduce PCA", seqCTrace, mrCTrace, EPSILON); Assert.assertEquals("The PPCA error between sequntial and mapreduce methods is too different: " + bishopSeqErr + "!= " + bishopMRErr, bishopSeqErr, bishopMRErr, EPSILON); }
From source file:org.qcri.pca.SPCADriver.java
/*** * PPCA: sequential PPCA based on the paper from Tipping and Bishop * //from w ww. j ava 2 s .c o m * @param conf * the configuration * @param input * the path to the input matrix Y * @param output * the output path (not used currently) * @param nRows * number or rows in Y * @param nCols * number of columns in Y * @param nPCs * number of desired principal components * @return the error * @throws Exception */ double runSequential(Configuration conf, Path input, Path output, final int nRows, final int nCols, final int nPCs) throws Exception { Matrix centralY = new DenseMatrix(nRows, nCols); FileSystem fs = FileSystem.get(input.toUri(), conf); if (fs.listStatus(input).length == 0) { System.err.println("No file under " + input); return 0; } int row = 0; for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, null, conf)) { centralY.assignRow(row, vw.get()); row++; } Matrix centralC = PCACommon.randomMatrix(nCols, nPCs); double ss = PCACommon.randSS(); InitialValues initVal = new InitialValues(centralC, ss); // Matrix sampledYe = sample(centralY); // runSequential(conf, sampledYe, initVal, 100); double error = runSequential(conf, centralY, initVal, 100); return error; }
From source file:org.qcri.pca.SPCADriver.java
/** * PPCA: sequential PPCA based on the matlab implementation of Jacob Verbeek * /*from w w w . j a v a2 s . c om*/ * @param conf * the configuration * @param input * the path to the input matrix Y * @param output * the output path (not used currently) * @param nRows * number or rows in Y * @param nCols * number of columns in Y * @param nPCs * number of desired principal components * @return the error * @throws Exception */ double runSequential_JacobVersion(Configuration conf, Path input, Path output, final int nRows, final int nCols, final int nPCs) throws Exception { Matrix centralY = new DenseMatrix(nRows, nCols); FileSystem fs = FileSystem.get(input.toUri(), conf); if (fs.listStatus(input).length == 0) { System.err.println("No file under " + input); return 0; } int row = 0; for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, null, conf)) { centralY.assignRow(row, vw.get()); row++; } Matrix C = PCACommon.randomMatrix(nCols, nPCs); double ss = PCACommon.randSS(); InitialValues initVal = new InitialValues(C, ss); double error = runSequential_JacobVersion(conf, centralY, initVal, 100); return error; }
From source file:sample.DisplayClustering.java
License:Apache License
/**
 * Reads the {@link ClusterWritable} values stored under {@code clustersIn}, prints a one-line
 * summary of each wrapped cluster to standard output, and returns the clusters.
 *
 * @param clustersIn path handed to {@link SequenceFileDirValueIterable} with {@code PathType.LIST}
 *                   and {@code PathFilters.logsCRCFilter()}
 * @return the unwrapped clusters in read order
 */
protected static List<Cluster> readClustersWritable(Path clustersIn) {
  Configuration hadoopConf = new Configuration();
  Iterable<ClusterWritable> stored = new SequenceFileDirValueIterable<ClusterWritable>(
      clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), hadoopConf);

  List<Cluster> result = new ArrayList<>();
  for (ClusterWritable wrapper : stored) {
    Cluster current = wrapper.getValue();
    // Built piecewise, in the same field order as the printed line: id, center, count, radius.
    String summary = "Cluster: " + current.getId();
    summary += " " + AbstractCluster.formatVector(current.getCenter(), null);
    summary += " " + current.getNumObservations();
    summary += " " + AbstractCluster.formatVector(current.getRadius(), null);
    System.out.println(summary);
    result.add(current);
  }
  return result;
}