List of usage examples for the org.apache.mahout.math.AbstractVector constructor AbstractVector
protected AbstractVector(int size)
From source file: edu.dfci.cccb.mev.kmeans.domain.hadoop.HadoopKMeansBuilder.java
License: Open Source License
@Override public KMeans build() throws DatasetException { try (TemporaryFolder hadoop = new TemporaryFolder()) { File points = new File(hadoop, "points"); points.mkdir();//from w w w . j a v a 2s .c om Configuration configuration = new Configuration(); FileSystem system = get(configuration); final Dimension other = dataset().dimension(dimension().type() == ROW ? COLUMN : ROW); List<NamedVector> vectors = new AbstractList<NamedVector>() { @Override public NamedVector get(int index) { final String vector = dimension().keys().get(index); return new NamedVector(new AbstractVector(other.keys().size()) { @Override public void setQuick(int index, double value) { throw new UnsupportedOperationException(); } @Override public Vector like() { return new RandomAccessSparseVector(size()); } @Override public Iterator<Element> iterator() { return new Iterator<Element>() { private int current = 0; @Override public boolean hasNext() { return current < other.keys().size(); } @Override public Element next() { return new Element() { private final int index = current++; @Override public void set(double value) { throw new UnsupportedOperationException(); } @Override public int index() { return index; } @Override @SneakyThrows(InvalidCoordinateException.class) public double get() { return dimension().type() == ROW ? dataset().values().get(vector, other.keys().get(index)) : dataset().values().get(other.keys().get(index), vector); } }; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public Iterator<Element> iterateNonZero() { return iterator(); } @Override public boolean isSequentialAccess() { return true; } @Override public boolean isDense() { return true; } @Override @SneakyThrows(InvalidCoordinateException.class) public double getQuick(int index) { return dimension().type() == ROW ? 
dataset().values().get(vector, other.keys().get(index)) : dataset().values().get(other.keys().get(index), vector); } @Override public int getNumNondefaultElements() { return other.keys().size(); } @Override protected Matrix matrixLike(int rows, int columns) { throw new UnsupportedOperationException(); } }, vector); } @Override public int size() { return dimension().keys().size(); } }; // write input try (Writer writer = new Writer(system, configuration, new Path(new File(points, "file1").getAbsolutePath()), LongWritable.class, VectorWritable.class)) { VectorWritable writable = new VectorWritable(); long record = 0; for (Vector vector : vectors) { writable.set(vector); writer.append(new LongWritable(record++), writable); } } // prepare clusters File clusters = new File(hadoop, "clusters"); clusters.mkdir(); try (Writer writer = new Writer(system, configuration, new Path(new File(clusters, "part-00000").getAbsolutePath()), Text.class, Cluster.class)) { for (int i = 0; i < k(); i++) { Vector vec = vectors.get(i); Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure()); writer.append(new Text(cluster.getIdentifier()), cluster); } } File output = new File(hadoop, "output"); output.mkdir(); try { run(configuration, new Path(points.getAbsolutePath()), new Path(clusters.getAbsolutePath()), new Path(output.getAbsolutePath()), metric.measurer(), convergence(), iterations(), true, false); try (Reader reader = new Reader(system, new Path( new File(new File(output, CLUSTERED_POINTS_DIR), "/part-m-00000").getAbsolutePath()), configuration)) { IntWritable key = new IntWritable(); WeightedVectorWritable value = new WeightedVectorWritable(); Map<String, Set<String>> result = new HashMap<>(); while (reader.next(key, value)) { Set<String> cluster = result.get(key.toString()); if (cluster == null) result.put(key.toString(), cluster = new HashSet<>()); cluster.add(((NamedVector) value.getVector()).getName()); } return new AbstractKMeans() { 
}.dataset(dataset()).dimension(dimension()).name(name()).type(type()) .clusters(new HashSet<>(result.values())); } } catch (ClassNotFoundException | InterruptedException e) { throw new DatasetException(e); } } catch (IOException e) { throw new DatasetException(e); } }