List of usage examples for the org.apache.mahout.math.NamedVector constructor:
public NamedVector(Vector delegate, String name)
From source file: com.cloudera.science.ml.core.vectors.Vectors.java
License: Open Source License
/**
 * Constructs a {@code NamedVector} from the given name and values.
 *
 * @param name The name of the vector
 * @param v The values it contains
 * @return A new {@code NamedVector}
 */
public static Vector named(String name, double... v) {
    return new NamedVector(of(v), name);
}
From source file: com.cloudera.science.ml.parallel.fn.SvmLightFnTest.java
License: Open Source License
/**
 * Verifies SVMLight-style serialization of named vectors: a dense named
 * vector emits every index, while a sparse one emits only its set entries,
 * each prefixed by the vector's name.
 */
@Test
public void testNamedVector() throws Exception {
    // Dense named vector: all three indices appear in the output.
    Vector v = Vectors.named("foo", 1.0, 2.0, 3.0);
    assertEquals("foo 0:1.0 1:2.0 2:3.0", fn.map(v));
    // Sparse vector wrapped in a NamedVector: only the two set entries appear.
    v = Vectors.sparse(10);
    v.set(3, 7.2);
    v.set(6, 12.0);
    v = new NamedVector(v, "bar");
    assertEquals("bar 3:7.2 6:12.0", fn.map(v));
}
From source file: com.elex.dmp.vectorizer.TFPartialVectorReducer.java
License: Apache License
@Override protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); if (!it.hasNext()) { return;//from www .j av a 2 s. c o m } StringTuple value = it.next(); Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size if (maxNGramSize >= 2) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); try { do { String term = sf.getAttribute(CharTermAttribute.class).toString(); if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } while (sf.incrementToken()); sf.end(); } finally { Closeables.closeQuietly(sf); } } else { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } } if (sequentialAccess) { vector = new SequentialAccessSparseVector(vector); } if (namedVector) { vector = new NamedVector(vector, key.toString()); } // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk. if (vector.getNumNondefaultElements() > 0) { VectorWritable vectorWritable = new VectorWritable(vector); context.write(key, vectorWritable); } else { context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1); } }
From source file: com.lakhani.anchorgraph.applestovectors.java
/**
 * Demo: builds a few NamedVectors describing apples, writes them to a
 * SequenceFile keyed by vector name, then reads the file back and prints
 * each entry.
 */
public static void main(String args[]) throws Exception {
    List<NamedVector> apples = new ArrayList<NamedVector>();
    NamedVector apple;
    apple = new NamedVector(new DenseVector(new double[] { 0.11, 510, 1 }), "Small round green apple");
    apples.add(apple);
    apple = new NamedVector(new DenseVector(new double[] { 0.23, 650, 3 }), "Large oval red apple");
    apples.add(apple);
    apple = new NamedVector(new DenseVector(new double[] { 0.09, 630, 1 }), "Small elongated red apple");
    apples.add(apple);
    apple = new NamedVector(new DenseVector(new double[] { 0.25, 590, 3 }), "Large round yellow apple");
    apples.add(apple);
    apple = new NamedVector(new DenseVector(new double[] { 0.18, 520, 2 }), "Medium oval green apple");
    // BUG FIX: the fifth apple was created but never added to the list.
    apples.add(apple);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("/user/cloudera/anchorgraph/output");
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class);
    try {
        VectorWritable vec = new VectorWritable();
        for (NamedVector vector : apples) {
            vec.set(vector);
            writer.append(new Text(vector.getName()), vec);
        }
    } finally {
        // Close even on error so the SequenceFile header/data are flushed.
        writer.close();
    }
    // BUG FIX: read back the file just written; the original opened
    // "appledata/apples", a path this program never created.
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    try {
        Text key = new Text();
        VectorWritable value = new VectorWritable();
        while (reader.next(key, value)) {
            System.out.println(key.toString() + " " + value.get().asFormatString());
        }
    } finally {
        reader.close();
    }
}
From source file: com.mozilla.grouperfish.pig.storage.DocumentVectorStorage.java
License: Apache License
@SuppressWarnings("unchecked") @Override/* w ww .jav a2 s. c o m*/ public void putNext(Tuple tuple) throws IOException { outputKey.set((String) tuple.get(0)); Tuple vectorTuple = (Tuple) tuple.get(1); Vector vector = new NamedVector(new RandomAccessSparseVector(dimensions, vectorTuple.size()), outputKey.toString()); for (int i = 0; i < vectorTuple.size(); i++) { Object o = vectorTuple.get(i); switch (vectorTuple.getType(i)) { case DataType.INTEGER: // If this is just an integer then we just want to set the index to 1.0 vector.set((Integer) o, 1.0); break; case DataType.TUPLE: // If this is a tuple then we want to set the index and the weight Tuple subt = (Tuple) o; vector.set((Integer) subt.get(0), (Double) subt.get(1)); break; default: throw new RuntimeException("Unexpected tuple form"); } } outputValue.set(vector); try { writer.write(outputKey, outputValue); } catch (InterruptedException e) { LOG.error("Interrupted while writing", e); } }
From source file: com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License: Apache License
@Override public void putNext(Tuple t) throws IOException { IntWritable outputKey = new IntWritable(); VectorWritable outputValue = new VectorWritable(); outputKey.set((Integer) t.get(0)); Tuple currRow = (Tuple) t.get(1);//from w ww . j a v a2s . c o m Vector currRowVector; if (dimensions == 0) { throw new IllegalArgumentException("Trying to create 0 dimension vector"); } if (STORE_AS_DENSE) { currRowVector = new NamedVector(new DenseVector(dimensions), outputKey.toString()); } else if (STORE_AS_SEQUENTIAL) { currRowVector = new NamedVector(new SequentialAccessSparseVector(dimensions, currRow.size()), outputKey.toString()); } else { currRowVector = new NamedVector(new RandomAccessSparseVector(dimensions, currRow.size()), outputKey.toString()); } for (int ii = 0; ii < currRow.size(); ii++) { Object o = currRow.get(ii); switch (currRow.getType(ii)) { case DataType.INTEGER: case DataType.LONG: case DataType.FLOAT: case DataType.DOUBLE: currRowVector.set(ii, (Double) o); break; case DataType.TUPLE: // If this is a tuple then we want to set column and element Tuple subt = (Tuple) o; currRowVector.set((Integer) subt.get(0), (Double) subt.get(1)); break; default: throw new RuntimeException("Unexpected tuple form"); } } outputValue.set(currRowVector); try { writer.write(outputKey, outputValue); } catch (InterruptedException e) { LOG.error("Interrupted while writing", e); } }
From source file: csvToSequence.ConvertToSeqLargeTxtVec.java
public static void main(String[] args) throws IOException { String filename = "/home/ivan/WorkDir/ccFraud.csv"; String outputfilename = "/home/ivan/WorkDir/part-0000"; SequenceFile.Writer writer;// w w w . ja va 2s .com Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path path = new Path(outputfilename); writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); BufferedReader br = new BufferedReader(new FileReader(filename)); String s; br.readLine(); //skip line while ((s = br.readLine()) != null) { String[] value = s.split(","); double[] numValue = new double[8]; for (int i = 0; i < 8; i++) numValue[i] = Double.parseDouble(value[i]); if (Integer.parseInt(value[8]) == 1) value[8] = "Fraud/" + value[8]; else value[8] = "Normal/" + value[8]; NamedVector oneV = new NamedVector(new DenseVector(numValue), value[8]); vec.set(oneV.getDelegate()); writer.append(new Text(oneV.getName()), vec); } writer.close(); }
From source file: csvToSequence.ConvertToSeqTextVecWritable.java
public static void main(String[] args) throws FileNotFoundException, IOException { String filename = "/home/ivan/WorkDir/ccFraud.csv"; String outputfilename = "/home/ivan/WorkDir/part-0000"; SequenceFile.Writer writer;/*from w w w .j a va 2s.c om*/ Configuration conf = new Configuration(); List<NamedVector> namedVectors = new ArrayList<>(); /*Integer i = 1; CSVVectorIterator vectorCSVVectorIterator = new CSVVectorIterator(new FileReader(filename)); //System.out.println("Densvector"+vec.next()): while(vectorCSVVectorIterator.hasNext()){ NamedVector vecIt = new NamedVector(vectorCSVVectorIterator.next(),i.toString()); namedVectors.add(vecIt); i++; }*/ BufferedReader br = new BufferedReader(new FileReader(filename)); String s; br.readLine(); //skip line while ((s = br.readLine()) != null) { String[] value = s.split(","); double[] numValue = new double[8]; for (int i = 0; i < 8; i++) numValue[i] = Double.parseDouble(value[i]); if (Integer.parseInt(value[8]) == 1) value[8] = "Fraud/" + value[8]; else value[8] = "Normal/" + value[8]; NamedVector oneV = new NamedVector(new DenseVector(numValue), value[8]); namedVectors.add(oneV); } FileSystem fs = FileSystem.get(conf); Path path = new Path(outputfilename); writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); for (NamedVector iter : namedVectors) { vec.set(iter.getDelegate()); writer.append(new Text(iter.getName()), vec); } writer.close(); /*try (SequenceFile.Reader reader = new SequenceFile.Reader(fs,path, conf)) { Text key = new Text(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { System.out.println(key + " "+ value); } }*/ }
From source file: edu.dfci.cccb.mev.kmeans.domain.hadoop.HadoopKMeansBuilder.java
License: Open Source License
/**
 * Runs Mahout k-means over the dataset inside a temporary Hadoop working
 * directory and maps the clustered points back to row/column names.
 *
 * Phases: (1) expose the dataset as a lazy, read-only List of NamedVector
 * views (the anonymous AbstractList/AbstractVector below — all mutators
 * throw UnsupportedOperationException and reads delegate to
 * dataset().values()); (2) write those vectors to a "points" SequenceFile;
 * (3) seed k initial clusters from the first k vectors (NOTE(review): seeds
 * are positional, not random — presumably intentional; confirm); (4) run
 * the k-means driver and read back part-m-00000, grouping vector names by
 * cluster id into the resulting KMeans object.
 *
 * @return the clustering result, keyed by dimension key names
 * @throws DatasetException wrapping any I/O, class-loading, or interrupt
 *         failure from the Hadoop job
 */
@Override public KMeans build() throws DatasetException { try (TemporaryFolder hadoop = new TemporaryFolder()) { File points = new File(hadoop, "points"); points.mkdir(); Configuration configuration = new Configuration(); FileSystem system = get(configuration); final Dimension other = dataset().dimension(dimension().type() == ROW ? COLUMN : ROW); List<NamedVector> vectors = new AbstractList<NamedVector>() { @Override public NamedVector get(int index) { final String vector = dimension().keys().get(index); return new NamedVector(new AbstractVector(other.keys().size()) { @Override public void setQuick(int index, double value) { throw new UnsupportedOperationException(); } @Override public Vector like() { return new RandomAccessSparseVector(size()); } @Override public Iterator<Element> iterator() { return new Iterator<Element>() { private int current = 0; @Override public boolean hasNext() { return current < other.keys().size(); } @Override public Element next() { return new Element() { private final int index = current++; @Override public void set(double value) { throw new UnsupportedOperationException(); } @Override public int index() { return index; } @Override @SneakyThrows(InvalidCoordinateException.class) public double get() { return dimension().type() == ROW ? dataset().values().get(vector, other.keys().get(index)) : dataset().values().get(other.keys().get(index), vector); } }; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public Iterator<Element> iterateNonZero() { return iterator(); } @Override public boolean isSequentialAccess() { return true; } @Override public boolean isDense() { return true; } @Override @SneakyThrows(InvalidCoordinateException.class) public double getQuick(int index) { return dimension().type() == ROW ? 
dataset().values().get(vector, other.keys().get(index)) : dataset().values().get(other.keys().get(index), vector); } @Override public int getNumNondefaultElements() { return other.keys().size(); } @Override protected Matrix matrixLike(int rows, int columns) { throw new UnsupportedOperationException(); } }, vector); } @Override public int size() { return dimension().keys().size(); } }; // write input try (Writer writer = new Writer(system, configuration, new Path(new File(points, "file1").getAbsolutePath()), LongWritable.class, VectorWritable.class)) { VectorWritable writable = new VectorWritable(); long record = 0; for (Vector vector : vectors) { writable.set(vector); writer.append(new LongWritable(record++), writable); } } // prepare clusters File clusters = new File(hadoop, "clusters"); clusters.mkdir(); try (Writer writer = new Writer(system, configuration, new Path(new File(clusters, "part-00000").getAbsolutePath()), Text.class, Cluster.class)) { for (int i = 0; i < k(); i++) { Vector vec = vectors.get(i); Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure()); writer.append(new Text(cluster.getIdentifier()), cluster); } } File output = new File(hadoop, "output"); output.mkdir(); try { run(configuration, new Path(points.getAbsolutePath()), new Path(clusters.getAbsolutePath()), new Path(output.getAbsolutePath()), metric.measurer(), convergence(), iterations(), true, false); try (Reader reader = new Reader(system, new Path( new File(new File(output, CLUSTERED_POINTS_DIR), "/part-m-00000").getAbsolutePath()), configuration)) { IntWritable key = new IntWritable(); WeightedVectorWritable value = new WeightedVectorWritable(); Map<String, Set<String>> result = new HashMap<>(); while (reader.next(key, value)) { Set<String> cluster = result.get(key.toString()); if (cluster == null) result.put(key.toString(), cluster = new HashSet<>()); cluster.add(((NamedVector) value.getVector()).getName()); } return new AbstractKMeans() { 
}.dataset(dataset()).dimension(dimension()).name(name()).type(type()) .clusters(new HashSet<>(result.values())); } } catch (ClassNotFoundException | InterruptedException e) { throw new DatasetException(e); } } catch (IOException e) { throw new DatasetException(e); } }
From source file: edu.indiana.d2i.htrc.io.index.solr.SolrClient.java
License: Apache License
private NamedVector parseOneVolume(InputStream content) throws XMLStreamException, IOException { // java.io.BufferedReader br = new java.io.BufferedReader(new java.io.InputStreamReader(content)); // String line = ""; // while ((line = br.readLine()) != null) { // System.out.println(line); // }/*from w w w . ja v a 2 s . co m*/ // br.close(); String volumeID = null; Vector vector = null; XMLStreamReader parser = factory.createXMLStreamReader(content); while (parser.hasNext()) { int event = parser.next(); if (event == XMLStreamConstants.START_ELEMENT) { String attributeValue = parser.getAttributeValue(null, "name"); if (attributeValue != null) { if (attributeValue.equals(VOLUME_ID)) { volumeID = parser.getElementText(); volumeID = pairtree.uncleanId(volumeID); } else if (attributeValue.equals(VOLUME_OCR)) { vector = createVector(parser); break; } } } } NamedVector tv = new NamedVector(vector, volumeID); return tv; }