List of usage examples for org.apache.mahout.clustering Cluster getCenter
Vector getCenter();
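For orientation, here is a minimal sketch of the call itself; the printCenter helper and its cluster argument are illustrative assumptions, not taken from any one source file below:

import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.math.Vector;

public class GetCenterSketch {
    // 'cluster' is assumed to come from elsewhere, e.g. a ClusterWritable
    // as in the readClustersWritable() examples below.
    static void printCenter(Cluster cluster) {
        Vector center = cluster.getCenter();    // centroid of the cluster
        int dims = center.size();               // dimensionality
        double c0 = center.get(0);              // individual component access
        System.out.println(dims + "-d center: " + center.asFormatString());
        System.out.println("first component: " + c0);
    }
}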
From source file:DisplayClustering.java
License:Apache License
protected static void plotClusters(Graphics2D g2) {
    int cx = CLUSTERS.size() - 1;
    for (List<Cluster> clusters : CLUSTERS) {
        g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
        g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
        for (Cluster cluster : clusters) {
            plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
        }
    }
}
From source file:DisplayClustering.java
License:Apache License
protected static List<Cluster> readClustersWritable(Path clustersIn) {
    List<Cluster> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
            PathFilters.logsCRCFilter(), conf)) {
        Cluster cluster = value.getValue();
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", cluster.getId(),
                AbstractCluster.formatVector(cluster.getCenter(), null), cluster.getNumObservations(),
                AbstractCluster.formatVector(cluster.getRadius(), null));
        clusters.add(cluster);
    }
    return clusters;
}
From source file:com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
@Override
public NamedList cluster(SolrParams params) {
    NamedList result = new NamedList();
    // check to see if we have new results
    try {
        if (theFuture != null) {
            // see if we have new results, but don't wait too long for them
            ClusterJob job = theFuture.get(1, TimeUnit.MILLISECONDS);
            if (lastSuccessful != null) {
                // clean up the old ones
                // TODO: clean up the old dirs before switching lastSuccessful
            }
            lastSuccessful = job;
            theFuture = null;
        }
    } catch (InterruptedException e) {
        log.error("Exception", e);
    } catch (ExecutionException e) {
        log.error("Exception", e);
    } catch (TimeoutException e) {
        log.error("Exception", e);
    }
    if (lastSuccessful != null) { // we have clusters
        // do we need the points?
        boolean includePoints = params.getBool(INCLUDE_POINTS, false);
        int clusterId = params.getInt(LIST_POINTS, Integer.MIN_VALUE);
        Map<Integer, List<String>> toPoints = lastSuccessful.clusterIdToPoints;
        String docId = params.get(IN_CLUSTER);
        if ((includePoints || clusterId != Integer.MIN_VALUE || docId != null) && toPoints == null) {
            // load the points
            try {
                toPoints = readPoints(new Path(lastSuccessful.jobDir + File.separator + "points"),
                        lastSuccessful.conf);
            } catch (IOException e) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                        "Unable to load points: " + lastSuccessful);
            }
        }
        if (params.getBool(LIST_CLUSTERS)) {
            NamedList nl = new NamedList();
            result.add("all", nl);
            Map<Integer, Cluster> clusterMap = lastSuccessful.clusters;
            if (clusterMap == null) {
                // we aren't caching, so load 'em up
                try {
                    clusterMap = loadClusters(lastSuccessful);
                } catch (Exception e) {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                            "unable to load the clusters from " + lastSuccessful);
                }
            }
            for (Cluster cluster : clusterMap.values()) {
                NamedList clusterNL = new NamedList();
                nl.add(String.valueOf(cluster.getId()), clusterNL);
                clusterNL.add("numPoints", cluster.getNumPoints());
                // TODO: better format?
                clusterNL.add("center", cluster.getCenter().asFormatString());
                if (cluster.getRadius() != null) {
                    clusterNL.add("radius", cluster.getRadius().asFormatString());
                }
                if (includePoints) {
                    List<String> points = toPoints.get(cluster.getId());
                    clusterNL.add("points", points);
                }
            }
        }
        if (docId != null) {
        }
        // TODO: support sending in multiple ids
        if (clusterId != Integer.MIN_VALUE) {
            List<String> points = lastSuccessful.clusterIdToPoints.get(clusterId);
            if (points != null) {
                result.add(String.valueOf(clusterId), points);
            }
        }
    } else if (params.getBool(BUILD, false)) {
        RefCounted<SolrIndexSearcher> refCnt = core.getSearcher();
        int theK = params.getInt(K, 10);
        cluster(refCnt.get(), theK);
        refCnt.decref();
    }
    return result;
}
From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java
License:Open Source License
public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields)
        throws SAXException, IOException, ParserConfigurationException {
    BasicDBList dbl = new BasicDBList();
    PropertiesManager props = new PropertiesManager();
    Configuration conf = getConfiguration(props);
    Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);
    @SuppressWarnings({ "unchecked", "rawtypes" })
    SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable(
            pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);

    // Very basic: only allow top-level fields, one level of nesting, and field removal
    HashSet<String> fieldLookup = null;
    if (null != fields) {
        fieldLookup = new HashSet<String>();
        String[] fieldArray = fields.split(",");
        for (String field : fieldArray) {
            String[] fieldDecomp = field.split(":");
            fieldLookup.add(fieldDecomp[0]);
        }
    } //TOTEST

    int nRecords = 0;
    for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) {
        BasicDBObject element = new BasicDBObject();

        // KEY
        Writable key = record.getFirst();
        if (key instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key;
            element.put("key", writable.toString());
        } else if (key instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key;
            element.put("key", Double.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key;
            element.put("key", Integer.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key;
            element.put("key", Long.toString(writable.get()));
        } else if (key instanceof BSONWritable) {
            element.put("key", MongoDbUtil.convert((BSONWritable) key));
        }

        // VALUE
        Writable value = record.getSecond();
        if (value instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value;
            element.put("value", writable.toString());
        } else if (value instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value;
            element.put("value", Double.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value;
            element.put("value", Integer.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value;
            element.put("value", Long.toString(writable.get()));
        } else if (value instanceof BSONWritable) {
            element.put("value", MongoDbUtil.convert((BSONWritable) value));
        } else if (value instanceof org.apache.mahout.math.VectorWritable) {
            Vector vec = ((org.apache.mahout.math.VectorWritable) value).get();
            BasicDBList dbl2 = listFromMahoutVector(vec, "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) {
            org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value;
            element.put("valueWeight", vecW.getWeight());
            BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) {
            Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue();
            BasicDBObject clusterVal = new BasicDBObject();
            clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal));
            clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal));
            element.put("value", clusterVal);
        } else {
            element.put("unknownValue", value.getClass().toString());
        }

        // Check the fields settings:
        // Only handle a few...
        if (null != fieldLookup) {
            for (String fieldToRemove : fieldLookup) {
                if (fieldToRemove.startsWith("value.")) {
                    fieldToRemove = fieldToRemove.substring(6);
                    // look up the nested object stored under the "value" key
                    BasicDBObject nested = (BasicDBObject) element.get("value");
                    if (null != nested) {
                        nested.remove(fieldToRemove);
                    }
                } else {
                    element.remove(fieldToRemove);
                }
            } //TOTEST
        }
        dbl.add(element);

        nRecords++;
        if ((nLimit > 0) && (nRecords >= nLimit)) {
            break;
        }
    }
    return dbl;
}
From source file:com.modofo.molo.cluster.DisplayClustering.java
License:Apache License
protected static List<Cluster> readClustersWritable(Path clustersIn) {
    List<Cluster> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
            PathFilters.logsCRCFilter(), conf)) {
        Cluster cluster = value.getValue();
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}",
                new Object[] { cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
                        cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null) });
        clusters.add(cluster);
    }
    return clusters;
}
From source file:io.github.thushear.display.DisplayClustering.java
License:Apache License
protected static void plotClusters(Graphics2D g2) {
    int cx = CLUSTERS.size() - 1;
    for (List<Cluster> clusters : CLUSTERS) {
        g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
        g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx--)]);
        for (Cluster cluster : clusters) {
            plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
        }
    }
}
From source file:io.github.thushear.display.DisplayClustering.java
License:Apache License
protected static List<Cluster> readClusters(Path clustersIn) {
    List<Cluster> clusters = new ArrayList<Cluster>();
    Configuration conf = new Configuration();
    for (Cluster value : new SequenceFileDirValueIterable<Cluster>(clustersIn, PathType.LIST,
            PathFilters.logsCRCFilter(), conf)) {
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}",
                new Object[] { value.getId(), AbstractCluster.formatVector(value.getCenter(), null),
                        value.getNumPoints(), AbstractCluster.formatVector(value.getRadius(), null) });
        clusters.add(value);
    }
    return clusters;
}
From source file:net.aprendizajengrande.ontocluster.Clusterer.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");
    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);
    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }
    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));
    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
From source file:net.aprendizajengrande.ontocluster.ClusterExtractor.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with rels> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }

    // delete the _SUCCESS file as it is problematic
    // see http://stackoverflow.com/questions/10752708/eofexception-at-org-apache-hadoop-io-sequencefilereader-initsequencefile-java
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);
    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));
    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
From source file:org.conan.mymahout.clustering.display.DisplayCanopy.java
License:Apache License
protected static void plotClusters(Graphics2D g2) {
    int cx = CLUSTERS.size() - 1;
    for (List<Cluster> clusters : CLUSTERS) {
        for (Cluster cluster : clusters) {
            if (isSignificant(cluster)) {
                g2.setStroke(new BasicStroke(1));
                g2.setColor(Color.BLUE);
                double[] t1 = { T1, T1 };
                plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
                double[] t2 = { T2, T2 };
                plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
                g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
                g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
                plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
            }
        }
        cx--;
    }
}