List of usage examples for org.apache.mahout.math NamedVector getName
public String getName()
From source file:com.lakhani.anchorgraph.applestovectors.java
public static void main(String args[]) throws Exception { List<NamedVector> apples = new ArrayList<NamedVector>(); NamedVector apple;//from w w w. j av a 2s . co m apple = new NamedVector(new DenseVector(new double[] { 0.11, 510, 1 }), "Small round green apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.23, 650, 3 }), "Large oval red apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.09, 630, 1 }), "Small elongated red apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.25, 590, 3 }), "Large round yellow apple"); apples.add(apple); apple = new NamedVector(new DenseVector(new double[] { 0.18, 520, 2 }), "Medium oval green apple"); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path path = new Path("/user/cloudera/anchorgraph/output"); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); for (NamedVector vector : apples) { vec.set(vector); writer.append(new Text(vector.getName()), vec); } writer.close(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("appledata/apples"), conf); Text key = new Text(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { System.out.println(key.toString() + " " + value.get().asFormatString()); } reader.close(); }
From source file:com.mozilla.grouperfish.mahout.clustering.display.kmeans.OriginalText.java
License:Apache License
public Map<Integer, Set<String>> getDocIds(double sampleRate) { Random rand = new Random(); Map<Integer, Set<String>> docIdMap = new HashMap<Integer, Set<String>>(); SequenceFileDirectoryReader pointsReader = null; try {//from w w w . j a v a 2 s. c o m IntWritable k = new IntWritable(); WeightedVectorWritable wvw = new WeightedVectorWritable(); pointsReader = new SequenceFileDirectoryReader(clusteredPointsPath); while (pointsReader.next(k, wvw)) { int clusterId = k.get(); Vector v = wvw.getVector(); if (v instanceof NamedVector) { if (rand.nextDouble() < sampleRate) { NamedVector nv = (NamedVector) v; nv.getName(); Set<String> curDocIds = docIdMap.get(clusterId); if (curDocIds == null) { curDocIds = new HashSet<String>(); } curDocIds.add(nv.getName()); docIdMap.put(clusterId, curDocIds); } } } } catch (IOException e) { LOG.error("IOException caught while reading clustered points", e); } finally { if (pointsReader != null) { pointsReader.close(); } } return docIdMap; }
From source file:com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java
License:Apache License
/**
 * Reads every clustered-point sequence file under {@code clusteredPointsPath} and files
 * each point into the co-cluster named by its record key.
 *
 * <p>Directories and entries whose name starts with "_" are skipped. Each point is
 * expected to be a {@code NamedVector} whose name parses as an integer id; that id is
 * looked up first in {@code docIDMap}, then in {@code featureIDMap}, and the point is
 * stored with the flag {@code true} or {@code false} respectively. Ids found in neither
 * map are logged as errors.
 *
 * <p>{@code IOException} and {@code ClassCastException} are caught and logged rather
 * than propagated.
 *
 * @throws IOException declared by the signature; from the visible code it can only
 *         escape from the final {@code fs.close()}
 */
private void loadPoints() throws IOException {
    SequenceFile.Reader pointsReader = null;
    IntWritable clusterKey = new IntWritable();
    WeightedVectorWritable point = new WeightedVectorWritable();
    try {
        fs = FileSystem.get(clusteredPointsPath.toUri(), conf);
        for (FileStatus entry : fs.listStatus(clusteredPointsPath)) {
            Path file = entry.getPath();
            if (entry.isDir() || file.getName().startsWith("_")) {
                continue;
            }
            try {
                pointsReader = new SequenceFile.Reader(fs, file, conf);
                while (pointsReader.next(clusterKey, point)) {
                    // NOTE(review): no null check on the looked-up cluster; an unknown key
                    // would fail on the put(...) calls below.
                    CoCluster target = coclusters.get(clusterKey.get());
                    NamedVector named = (NamedVector) point.getVector();
                    int id = Integer.parseInt(named.getName());
                    if (docIDMap.containsKey(id)) {
                        target.put(named, docIDMap.get(id), true);
                    } else if (featureIDMap.containsKey(id)) {
                        target.put(named, featureIDMap.get(id), false);
                    } else {
                        LOG.error("Key not feature or document!");
                    }
                }
            } finally {
                if (pointsReader != null) {
                    IOUtils.closeStream(pointsReader);
                }
            }
        }
    } catch (IOException ie) {
        LOG.info("Error while reading points", ie);
    } catch (ClassCastException ce) {
        LOG.info("NamedVectors possibly not used", ce);
    } finally {
        // The last reader has normally been closed by the inner finally already; this
        // second close is kept to mirror the original control flow.
        if (pointsReader != null) {
            IOUtils.closeStream(pointsReader);
        }
        // NOTE(review): FileSystem.get may hand out a shared cached instance; confirm
        // that closing it here cannot affect other users of the same file system.
        if (fs != null) {
            fs.close();
        }
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.KMeansOutputLoader.java
License:Apache License
/**
 * Reads the next clustered point and converts it to a three-field tuple.
 *
 * <p>The tuple holds, in order: the cluster id (the record's {@code IntWritable} key),
 * the vector id (the {@code NamedVector} name parsed as an integer), and a bag of
 * {@code (index, value)} tuples, one per element yielded by
 * {@code Vector.iterateNonZero()}.
 *
 * @return the next row, or {@code null} when the underlying reader has no more records
 * @throws IOException if reading fails, the thread is interrupted, the vector name is
 *         not an integer, or the value's vector is not a {@code NamedVector}; the
 *         original exception is preserved as the cause
 */
@Override
public Tuple getNext() throws IOException {
    try {
        if (!this.reader.nextKeyValue()) {
            return null;
        }
        Tuple currRow = tupleFactory.newTuple(3);
        DataBag rowInfoBag = bagFactory.newDefaultBag();

        IntWritable key = (IntWritable) reader.getCurrentKey();
        int clusterID = key.get();

        WeightedVectorWritable value = (WeightedVectorWritable) reader.getCurrentValue();
        Vector rowInfo = value.getVector();
        NamedVector nrowInfo = (NamedVector) rowInfo;
        int vectorID = Integer.parseInt(nrowInfo.getName());

        for (Iterator<Vector.Element> itr = rowInfo.iterateNonZero(); itr.hasNext();) {
            Vector.Element elemInfo = itr.next();
            Tuple currElement = tupleFactory.newTuple(2);
            currElement.set(0, elemInfo.index());
            currElement.set(1, elemInfo.get());
            rowInfoBag.add(currElement);
        }

        currRow.set(0, clusterID);
        currRow.set(1, vectorID);
        currRow.set(2, rowInfoBag);
        return currRow;
    } catch (InterruptedException ie) {
        // Restore the interrupt flag: wrapping the exception would otherwise hide the
        // interruption from code further up the stack.
        Thread.currentThread().interrupt();
        LOG.error("Interrupted while reading", ie);
        throw new IOException(ie);
    } catch (NumberFormatException ne) {
        LOG.error("Possible use of non int values for NamedVector keys", ne);
        throw new IOException(ne);
    } catch (ClassCastException e) {
        LOG.error("Possible cast of normal Vector to NamedVector", e);
        throw new IOException(e);
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License:Apache License
@Override public Tuple getNext() throws IOException { try {//from ww w .ja v a2s.c om if (!this.reader.nextKeyValue()) { return null; } Tuple currRow = tupleFactory.newTuple(2); DataBag rowInfoBag = bagFactory.newDefaultBag(); IntWritable key = reader.getCurrentKey(); int rowID = key.get(); VectorWritable value = reader.getCurrentValue(); Vector rowInfo = value.get(); if (rowInfo instanceof NamedVector) { NamedVector nrowInfo = (NamedVector) rowInfo; rowID = Integer.parseInt(nrowInfo.getName()); } for (Iterator<Vector.Element> itr = rowInfo.iterateNonZero(); itr.hasNext();) { Vector.Element elemInfo = itr.next(); Tuple currElement = tupleFactory.newTuple(2); currElement.set(0, elemInfo.index()); currElement.set(1, elemInfo.get()); rowInfoBag.add(currElement); } currRow.set(0, rowID); currRow.set(1, rowInfoBag); return currRow; } catch (InterruptedException ie) { LOG.error("Interrupted while reading", ie); throw new IOException(ie); } catch (NumberFormatException ne) { LOG.error("Possible use of non int values for NamedVector keys", ne); throw new IOException(ne); } }
From source file:csvToSequence.ConvertToSeqLargeTxtVec.java
public static void main(String[] args) throws IOException { String filename = "/home/ivan/WorkDir/ccFraud.csv"; String outputfilename = "/home/ivan/WorkDir/part-0000"; SequenceFile.Writer writer;/*from w ww. j a va2 s . c o m*/ Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path path = new Path(outputfilename); writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); BufferedReader br = new BufferedReader(new FileReader(filename)); String s; br.readLine(); //skip line while ((s = br.readLine()) != null) { String[] value = s.split(","); double[] numValue = new double[8]; for (int i = 0; i < 8; i++) numValue[i] = Double.parseDouble(value[i]); if (Integer.parseInt(value[8]) == 1) value[8] = "Fraud/" + value[8]; else value[8] = "Normal/" + value[8]; NamedVector oneV = new NamedVector(new DenseVector(numValue), value[8]); vec.set(oneV.getDelegate()); writer.append(new Text(oneV.getName()), vec); } writer.close(); }
From source file:csvToSequence.ConvertToSeqTextVecWritable.java
public static void main(String[] args) throws FileNotFoundException, IOException { String filename = "/home/ivan/WorkDir/ccFraud.csv"; String outputfilename = "/home/ivan/WorkDir/part-0000"; SequenceFile.Writer writer;/*from www. j a va 2s . c o m*/ Configuration conf = new Configuration(); List<NamedVector> namedVectors = new ArrayList<>(); /*Integer i = 1; CSVVectorIterator vectorCSVVectorIterator = new CSVVectorIterator(new FileReader(filename)); //System.out.println("Densvector"+vec.next()): while(vectorCSVVectorIterator.hasNext()){ NamedVector vecIt = new NamedVector(vectorCSVVectorIterator.next(),i.toString()); namedVectors.add(vecIt); i++; }*/ BufferedReader br = new BufferedReader(new FileReader(filename)); String s; br.readLine(); //skip line while ((s = br.readLine()) != null) { String[] value = s.split(","); double[] numValue = new double[8]; for (int i = 0; i < 8; i++) numValue[i] = Double.parseDouble(value[i]); if (Integer.parseInt(value[8]) == 1) value[8] = "Fraud/" + value[8]; else value[8] = "Normal/" + value[8]; NamedVector oneV = new NamedVector(new DenseVector(numValue), value[8]); namedVectors.add(oneV); } FileSystem fs = FileSystem.get(conf); Path path = new Path(outputfilename); writer = new SequenceFile.Writer(fs, conf, path, Text.class, VectorWritable.class); VectorWritable vec = new VectorWritable(); for (NamedVector iter : namedVectors) { vec.set(iter.getDelegate()); writer.append(new Text(iter.getName()), vec); } writer.close(); /*try (SequenceFile.Reader reader = new SequenceFile.Reader(fs,path, conf)) { Text key = new Text(); VectorWritable value = new VectorWritable(); while (reader.next(key, value)) { System.out.println(key + " "+ value); } }*/ }
From source file:edu.indiana.d2i.htrc.io.index.solr.SequentialVectorFromSolr.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 4) { printUsage();/*from www.j av a 2 s .c om*/ } String solrURL = args[0]; String dictionaryFile = args[1]; String idsFile = args[2]; String outputFile = args[3]; logger.info("SequentialVectorFromSolr "); logger.info(" - solrURL: " + solrURL); logger.info(" - dictionaryFile: " + dictionaryFile); logger.info(" - idsFile: " + idsFile); // on HDFS logger.info(" - outputFile: " + outputFile); // on HDFS Configuration conf = getConf(); // conf.set(HTRCConstants.SOLR_MAIN_URL, solrURL); conf.set("htrc.solr.url", solrURL); conf.set(HTRCConstants.DICTIONARY_PATH, dictionaryFile); SolrClient client = new SolrClient(conf, true); FileSystem fs = FileSystem.get(conf); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(outputFile), Text.class, VectorWritable.class); long t0 = System.nanoTime(); DataInputStream fsinput = new DataInputStream(fs.open(new Path(idsFile))); BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput)); String line = null; String[] ids = new String[1]; VectorWritable value = new VectorWritable(); Text key = new Text(); int count = 0; while ((line = reader.readLine()) != null) { ids[0] = line; Iterable<NamedVector> termVectors = client.getTermVectors(ids); for (NamedVector namedVector : termVectors) { value.set(namedVector); key.set(namedVector.getName()); writer.append(key, value); count++; } if (count % 1000 == 0) System.out.println("Finish " + count + " volumes."); } long t1 = System.nanoTime(); System.out.println("Takes " + (t1 - t0) / 1e9 + " seconds"); writer.close(); reader.close(); return 0; }
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
public static void clusterOutput(Configuration conf, Path path) { try {// w w w . jav a 2 s .c o m BufferedWriter bw; FileSystem fs = FileSystem.get(conf); SequenceFile.Reader reader = null; reader = new SequenceFile.Reader(fs, path, conf); // ?uidOfgrp.txt?? uid \t groupID bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsInfo.txt"))); HashMap<String, Integer> clusterIds; clusterIds = new HashMap<String, Integer>(120); IntWritable key = new IntWritable(); WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable(); // WeightedVectorWritable value = new WeightedVectorWritable(); while (reader.next(key, value)) { NamedVector vector = (NamedVector) value.getVector(); // VectorName String vectorName = vector.getName(); System.out.println(vectorName + "\t" + key.toString()); bw.write(vectorName + "\t" + key.toString() + "\n"); // ?group? if (clusterIds.containsKey(key.toString())) { clusterIds.put(key.toString(), clusterIds.get(key.toString()) + 1); } else clusterIds.put(key.toString(), 1); } bw.flush(); reader.close(); // ?group?grpSize bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsSize.txt"))); Set<String> keys = clusterIds.keySet(); for (String k : keys) { System.out.println(k + " " + clusterIds.get(k)); bw.write(k + " " + clusterIds.get(k) + "\n"); } bw.flush(); bw.close(); } catch (IOException e) { e.printStackTrace(); } }
From source file:org.qcri.pca.MahoutCompatibilityTest.java
License:Apache License
private static void doTestVectorWritableEquals(Vector v) throws IOException { Writable vectorWritable = new VectorWritable(v); VectorWritable vectorWritable2 = new VectorWritable(); writeAndRead(vectorWritable, vectorWritable2); Vector v2 = vectorWritable2.get(); if (v instanceof NamedVector) { assertTrue(v2 instanceof NamedVector); NamedVector nv = (NamedVector) v; NamedVector nv2 = (NamedVector) v2; assertEquals(nv.getName(), nv2.getName()); assertEquals("Victor", nv.getName()); }// w w w. j a va2 s. co m assertEquals(v, v2); }