Usage examples for org.apache.mahout.math.VectorWritable#set(Vector) — public void set(Vector vector).
From source file:at.illecker.hama.rootbeer.examples.matrixmultiplication.compositeinput.cpu.MatrixMultiplicationBSPCpu.java
License:Apache License
/**
 * BSP superstep for distributed matrix multiplication on the CPU.
 *
 * Each input tuple carries two row vectors: one fragment of the output-side
 * matrix (matrixB) and one multiplier row (transposed matrixA). For every
 * non-zero multiplier element, the outFrag row is scaled by that element and
 * sent to the master task keyed by the element's index, where partial rows
 * are later accumulated.
 *
 * @param peer BSP peer used to read input tuples and send row messages
 * @throws IOException          on input or logger failure
 * @throws SyncException        if the barrier sync fails
 * @throws InterruptedException if the peer is interrupted
 */
@Override
public void bsp(BSPPeer<IntWritable, TupleWritable, IntWritable, VectorWritable, MatrixRowMessage> peer)
        throws IOException, SyncException, InterruptedException {

    IntWritable key = new IntWritable();
    TupleWritable value = new TupleWritable();
    while (peer.readNext(key, value)) {

        // Logging of every input tuple (debug builds only).
        if (isDebuggingEnabled) {
            for (int i = 0; i < value.size(); i++) {
                Vector vector = ((VectorWritable) value.get(i)).get();
                logger.writeChars("bsp,input,key=" + key + ",value=" + vector.toString() + "\n");
            }
        }

        Vector firstVector = ((VectorWritable) value.get(0)).get();
        Vector secondVector = ((VectorWritable) value.get(1)).get();

        // outCardinality is the resulting column size n of (l x m) * (m x n) = (l x n).
        // The vector whose size matches n is the matrixB fragment; the other is
        // the multiplier (transposed matrixA row). Input order is not guaranteed.
        boolean firstIsOutFrag = secondVector.size() == outCardinality;

        // outFrag is the row fragment with the resulting column cardinality (matrixB)
        Vector outFrag = firstIsOutFrag ? secondVector : firstVector;

        // multiplier is the row with the resulting row count (transposed matrixA)
        Vector multiplier = firstIsOutFrag ? firstVector : secondVector;

        if (isDebuggingEnabled) {
            logger.writeChars("bsp,firstIsOutFrag=" + firstIsOutFrag + "\n");
            logger.writeChars("bsp,outFrag=" + outFrag + "\n");
            logger.writeChars("bsp,multiplier=" + multiplier + "\n");
        }

        // Only non-zero multiplier entries contribute partial rows.
        for (Vector.Element e : multiplier.nonZeroes()) {
            VectorWritable outVector = new VectorWritable();
            // Scalar Multiplication (Vector x Element)
            outVector.set(outFrag.times(e.get()));

            // Key by the element's index: the master accumulates partial rows per index.
            peer.send(masterTask, new MatrixRowMessage(e.index(), outVector));

            if (isDebuggingEnabled) {
                logger.writeChars("bsp,send,key=" + e.index() + ",value=" + outVector.get().toString() + "\n");
            }
        }
        if (isDebuggingEnabled) {
            logger.flush();
        }
    }
    // Barrier: all partial rows must be delivered before the master aggregates.
    peer.sync();
}
From source file:com.chimpler.example.eigenface.Helper.java
License:Apache License
/**
 * Persists an in-memory matrix as a Hadoop SequenceFile of
 * (row index, row vector) pairs: IntWritable -> VectorWritable.
 *
 * @param matrixSeqFileName path of the sequence file to create
 * @param covarianceMatrix  matrix to write, one DenseVector per row; may be empty
 * @throws Exception if the filesystem cannot be opened or a write fails
 */
public static void writeMatrixSequenceFile(String matrixSeqFileName, double[][] covarianceMatrix)
        throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    Writer matrixWriter = new SequenceFile.Writer(fs, configuration, new Path(matrixSeqFileName),
            IntWritable.class, VectorWritable.class);
    IntWritable key = new IntWritable();
    VectorWritable value = new VectorWritable();
    try {
        for (int i = 0; i < covarianceMatrix.length; i++) {
            key.set(i);
            // The row is serialized by append() before anything else touches it,
            // so the row array can be handed to DenseVector directly instead of
            // being copied element-by-element through a reusable buffer.
            value.set(new DenseVector(covarianceMatrix[i]));
            matrixWriter.append(key, value);
        }
    } finally {
        // Close in finally so a failed append does not leak the writer.
        matrixWriter.close();
    }
}
From source file:com.lakhani.anchorgraph.applestovectors.java
public static void main(String args[]) throws Exception { List<NamedVector> apples = new ArrayList<NamedVector>(); NamedVector apple;//w w w .ja v a 2s. co m apple = new NamedVector(new DenseVector(new double[] { 0.11, 510, 1 }), "Small round green apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.23, 650, 3 }), "Large oval red apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.09, 630, 1 }), "Small elongated red apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.25, 590, 3 }), "Large round yellow apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.18, 520, 2 }), "Medium oval green apple"); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path path = new Path("/user/cloudera/anchorgraph/output"); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); for (NamedVector vector : apples) { vec.set(vector); writer.append(new Text(vector.getName()), vec); } writer.close(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("appledata/apples"), conf); Text key = new Text(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { System.out.println(key.toString() + " " + value.get().asFormatString()); } reader.close(); }
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License:Apache License
@Override public void putNext(Tuple t) throws IOException { IntWritable outputKey = new IntWritable(); VectorWritable outputValue = new VectorWritable(); outputKey.set((Integer) t.get(0)); Tuple currRow = (Tuple) t.get(1);/* www. j a va 2 s .co m*/ Vector currRowVector; if (dimensions == 0) { throw new IllegalArgumentException("Trying to create 0 dimension vector"); } if (STORE_AS_DENSE) { currRowVector = new NamedVector(new DenseVector(dimensions), outputKey.toString()); } else if (STORE_AS_SEQUENTIAL) { currRowVector = new NamedVector(new SequentialAccessSparseVector(dimensions, currRow.size()), outputKey.toString()); } else { currRowVector = new NamedVector(new RandomAccessSparseVector(dimensions, currRow.size()), outputKey.toString()); } for (int ii = 0; ii < currRow.size(); ii++) { Object o = currRow.get(ii); switch (currRow.getType(ii)) { case DataType.INTEGER: case DataType.LONG: case DataType.FLOAT: case DataType.DOUBLE: currRowVector.set(ii, (Double) o); break; case DataType.TUPLE: // If this is a tuple then we want to set column and element Tuple subt = (Tuple) o; currRowVector.set((Integer) subt.get(0), (Double) subt.get(1)); break; default: throw new RuntimeException("Unexpected tuple form"); } } outputValue.set(currRowVector); try { writer.write(outputKey, outputValue); } catch (InterruptedException e) { LOG.error("Interrupted while writing", e); } }
From source file:com.twitter.algebra.AlgebraCommon.java
License:Apache License
/** * Convert an in-memory representation of a matrix to a distributed MapDir * format. It then can be used in distributed jobs * /*from w ww . j a v a2 s.c o m*/ * @param oriMatrix * @return path that will contain the matrix files * @throws Exception */ public static DistributedRowMatrix toMapDir(Matrix origMatrix, Path outPath, Path tmpPath, String label) throws Exception { Configuration conf = new Configuration(); Path outputDir = new Path(outPath, label + origMatrix.numRows() + "x" + origMatrix.numCols()); FileSystem fs = FileSystem.get(outputDir.toUri(), conf); if (!fs.exists(outputDir)) { Path mapDir = new Path(outputDir, "matrix-k-0"); Path outputFile = new Path(mapDir, "data"); @SuppressWarnings("deprecation") SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class, VectorWritable.class); VectorWritable vectorw = new VectorWritable(); IntWritable intw = new IntWritable(); try { for (int r = 0; r < origMatrix.numRows(); r++) { Vector vector = origMatrix.viewRow(r); vectorw.set(vector); intw.set(r); writer.append(intw, vectorw); } } finally { writer.close(); } MapFile.fix(fs, mapDir, IntWritable.class, VectorWritable.class, false, conf); } else { log.warn("----------- Skip matrix " + outputDir + " - already exists"); } DistributedRowMatrix dMatrix = new DistributedRowMatrix(outputDir, tmpPath, origMatrix.numRows(), origMatrix.numCols()); dMatrix.setConf(conf); return dMatrix; }
From source file:com.twitter.algebra.AlgebraCommon.java
License:Apache License
/** * Write a vector to filesystem so that it can be used by distributed jobs * @param vector/* w ww . j av a 2s . c o m*/ * @param outputDir * @param label the unique label that be used in naming the vector file * @param conf * @return * @throws IOException */ public static Path toDistributedVector(Vector vector, Path outputDir, String label, Configuration conf) throws IOException { Path outputFile = new Path(outputDir, "Vector-" + label); FileSystem fs = FileSystem.get(outputDir.toUri(), conf); if (fs.exists(outputFile)) { log.warn("----------- OVERWRITE " + outputFile + " already exists"); fs.delete(outputFile, false); } @SuppressWarnings("deprecation") SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class, VectorWritable.class); VectorWritable vectorw = new VectorWritable(); vectorw.set(vector); writer.append(new IntWritable(0), vectorw); writer.close(); return outputFile; }
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
/**
 * Get the value associated with the key.
 *
 * The MapDir is split across several MapFile partitions that are consumed
 * lazily: {@code lastReader} is the reader for the partition currently being
 * scanned, and {@code nextKey}/{@code nextValue} hold the read-ahead pair
 * from that reader. NOTE(review): {@code loadReader} appears to open the
 * partition covering {@code key} and set {@code noMorePartitions} when
 * exhausted — confirm against its definition.
 *
 * @param key the key to look up
 * @param val the object that will be filled with the retrieved value
 * @return {@code val} filled with the retrieved value, or null if the key is
 *         absent or all partitions are exhausted
 * @throws IOException if reading a partition fails
 */
public VectorWritable get(IntWritable key, VectorWritable val) throws IOException {
    // All partitions already consumed: nothing left to serve.
    if (lastReader == null && noMorePartitions)
        return null;
    if (lastReader == null) {
        // Open the partition for this key and position the read-ahead cursor
        // at the closest entry at-or-after it.
        loadReader(key);
        nextKey.set(key.get());
        boolean eof = lastReader.getClosest(nextKey, nextValue, true) == null;
        if (eof) {
            lastReader = null;
            return null;
        }
    }
    boolean eof = false;
    // skip over keys until we find the one that the user is asking for. This should
    // rarely occur as the user normally asks for sequential keys
    while (!eof && nextKey.compareTo(key) < 0)
        eof = !lastReader.next(nextKey, nextValue);
    // If the requested key is not in the current MapFile, reset the process and
    // search in the next MapFile using a recursive call
    if (eof) {
        lastReader = null;
        return get(key, val);
    }
    if (nextKey.equals(key)) {
        val.set(nextValue.get());
        // update nextKey and nextValue for the next call
        eof = !lastReader.next(nextKey, nextValue);
        if (eof)
            lastReader = null;
        return val;
    }
    // Cursor moved past the requested key without matching it: key not present.
    return null;
}
From source file:csvToSequence.ConvertToSeqLargeTxtVec.java
public static void main(String[] args) throws IOException { String filename = "/home/ivan/WorkDir/ccFraud.csv"; String outputfilename = "/home/ivan/WorkDir/part-0000"; SequenceFile.Writer writer;/*from ww w . ja v a 2 s . c o m*/ Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path path = new Path(outputfilename); writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); BufferedReader br = new BufferedReader(new FileReader(filename)); String s; br.readLine(); //skip line while ((s = br.readLine()) != null) { String[] value = s.split(","); double[] numValue = new double[8]; for (int i = 0; i < 8; i++) numValue[i] = Double.parseDouble(value[i]); if (Integer.parseInt(value[8]) == 1) value[8] = "Fraud/" + value[8]; else value[8] = "Normal/" + value[8]; NamedVector oneV = new NamedVector(new DenseVector(numValue), value[8]); vec.set(oneV.getDelegate()); writer.append(new Text(oneV.getName()), vec); } writer.close(); }
From source file:csvToSequence.ConvertToSeqTextVecWritable.java
public static void main(String[] args) throws FileNotFoundException, IOException { String filename = "/home/ivan/WorkDir/ccFraud.csv"; String outputfilename = "/home/ivan/WorkDir/part-0000"; SequenceFile.Writer writer;/*from ww w .j a va 2 s. c o m*/ Configuration conf = new Configuration(); List<NamedVector> namedVectors = new ArrayList<>(); /*Integer i = 1; CSVVectorIterator vectorCSVVectorIterator = new CSVVectorIterator(new FileReader(filename)); //System.out.println("Densvector"+vec.next()): while(vectorCSVVectorIterator.hasNext()){ NamedVector vecIt = new NamedVector(vectorCSVVectorIterator.next(),i.toString()); namedVectors.add(vecIt); i++; }*/ BufferedReader br = new BufferedReader(new FileReader(filename)); String s; br.readLine(); //skip line while ((s = br.readLine()) != null) { String[] value = s.split(","); double[] numValue = new double[8]; for (int i = 0; i < 8; i++) numValue[i] = Double.parseDouble(value[i]); if (Integer.parseInt(value[8]) == 1) value[8] = "Fraud/" + value[8]; else value[8] = "Normal/" + value[8]; NamedVector oneV = new NamedVector(new DenseVector(numValue), value[8]); namedVectors.add(oneV); } FileSystem fs = FileSystem.get(conf); Path path = new Path(outputfilename); writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); for (NamedVector iter : namedVectors) { vec.set(iter.getDelegate()); writer.append(new Text(iter.getName()), vec); } writer.close(); /*try (SequenceFile.Reader reader = new SequenceFile.Reader(fs,path, conf)) { Text key = new Text(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { System.out.println(key + " "+ value); } }*/ }
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
/**
 * Vectorizes a Lucene index into a SequenceFile of (IDAndCodes,
 * VectorWritable) document vectors plus a feature-dictionary text file.
 *
 * Two passes: first, all terms of non-"bip:" fields are registered in the
 * feature dictionary; second, each document's term frequencies are weighted
 * (via the {@code weighting} collaborator), normalized, and written out.
 * Fields starting with "bip:" are treated as code labels, not features.
 *
 * @param luceneIndexDir directory holding the Lucene index to read
 * @param outputDir      directory receiving documentVectors.seq and features.txt
 * @throws Exception on index read or filesystem write failure
 */
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        // Pass 1: register every term of every feature field in the dictionary
        // so that dict.numFeatures() is final before vectors are sized.
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        // Pass 2: build one weighted sparse vector per document.
        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();
            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            // totalTermFreq of a per-document term vector is the
                            // in-document frequency of this term.
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    // "bip:" fields carry code labels attached to the document.
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }
            // Convert to sequential-access form for efficient serialized iteration,
            // then normalize in place.
            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);
            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);
            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);
        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        // Guava Closeables: close quietly, swallowing IOExceptions on cleanup.
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}