List of usage examples for org.apache.mahout.common.distance.CosineDistanceMeasure
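CosineDistanceMeasure computes the cosine distance between two Mahout Vectors: 1 minus the cosine similarity, so the result is 0 for vectors pointing in the same direction and 1 for orthogonal vectors. A minimal sketch of calling it directly (the vector values are illustrative only):

import org.apache.mahout.common.distance.CosineDistanceMeasure;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class CosineDistanceExample {
    public static void main(String[] args) {
        DistanceMeasure measure = new CosineDistanceMeasure();
        Vector a = new DenseVector(new double[] { 1.0, 2.0, 3.0 });
        Vector b = new DenseVector(new double[] { 2.0, 4.0, 6.0 });
        // b is a scalar multiple of a, so the cosine distance is 0.0
        System.out.println(measure.distance(a, b));
    }
}

The usage examples below embed the same call in factories, MapReduce reducers, and Hadoop clustering drivers.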
From source file:cc.recommenders.mining.calls.DistanceMeasureFactory.java
License:Open Source License
public DistanceMeasure get() {
    switch (options.getDistanceMeasure()) {
    case COSINE:
        return new CosineDistanceMeasure();
    case MANHATTAN:
        return new ManhattanDistanceMeasure();
    }
    throw new RuntimeException("unknown distance measure");
}
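When the factory above is wired into a clustering pipeline, the returned object is used through the common DistanceMeasure interface; a hedged sketch (the factory instance and the two vectors are assumed placeholders, only get() and distance() come from the snippet):

// 'factory', 'rowA' and 'rowB' are hypothetical; substitute whatever the caller holds.
DistanceMeasure measure = factory.get();
double d = measure.distance(rowA, rowB); // cosine distance when the options select COSINE

Returning directly from each case and throwing for anything unmatched makes an unhandled enum constant fail fast instead of silently falling through.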
From source file:com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
public KMeansClusteringEngine() {
    execService = Executors.newSingleThreadExecutor();
    measure = new CosineDistanceMeasure();
}
From source file:com.gsvic.csmr.CosineSimilarityReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterable<VectorArrayWritable> value, Context context)
        throws IOException, InterruptedException {

    CosineDistanceMeasure cdm = new CosineDistanceMeasure();
    VectorWritable docX, docY;
    double cosine;

    for (VectorArrayWritable v : value) {
        docX = (VectorWritable) v.get()[0];
        docY = (VectorWritable) v.get()[1];
        // distance() returns the cosine distance, i.e. 1 - cosine similarity
        cosine = cdm.distance(docX.get(), docY.get());
        context.write(key, new DoubleWritable(cosine));
    }
}
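Despite the reducer's name, the value it emits is the cosine distance, not the similarity: 0 means the two document vectors point in the same direction, and larger values mean they diverge. If the similarity itself is wanted, it can be recovered with a one-line conversion; a hedged sketch reusing the variables above:

double similarity = 1.0 - cdm.distance(docX.get(), docY.get()); // cosine similarity rather than distance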
From source file:crawler.CETReCrawler.java
License:Apache License
private String[] distFilter() {
    DataSearch keywordsFilter = new DataSearch(TFFreq.keySet().toArray(new String[TFFreq.size()])); //pass potential keywords
    keywordsFilter.collectNewTweets();
    keywordsFilter.calculateTFIDF();

    ArrayList<String> keywordsFreqName = new ArrayList<String>();
    ArrayList<Vector> keywordsFreq = new ArrayList<Vector>(); //list of keywords vector
    ArrayList<String> keywords = new ArrayList<String>(); //list of keywords

    for (String basehash : Settings.baseKeywords) {
        keywordsFreqName.add(basehash);
        keywordsFreq.add(keywordsFilter.getSeedVect(basehash));
        keywords.add(basehash);
    }

    int count = 0;
    while (!keywordsFreq.isEmpty()) {
        String seq1Name = keywordsFreqName.get(0);
        Vector seq1 = keywordsFreq.get(0);
        Vector seq2;
        Iterator<Entry<String, Integer>> iter = TFFreq.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, Integer> ent = iter.next();
            String hashtag = ent.getKey();
            seq2 = keywordsFilter.getSeedVect(hashtag);
            //System.out.print("seq 1 ["+seq1Name+"] count ["+TFFreq.get(seq1Name)+"]: ");
            //System.out.println(seq1.asFormatString());
            //System.out.print("seq 2 ["+hashtag+"] count ["+TFFreq.get(hashtag)+"]: ");
            if (seq1 != null && seq2 != null) {
                //System.out.println(seq2.asFormatString());
                CosineDistanceMeasure cos = new CosineDistanceMeasure();
                double distVal = cos.distance(seq1, seq2);
                //System.out.println("***********["+hashtag+"] v.s ["+seq1Name+"]: "+distVal+"***********");

                //check whether one of the two words is baseline criteria
                boolean inBase = false;
                for (String basehash : Settings.baseKeywords) {
                    if (hashtag.replace("#", "").equals(basehash.toLowerCase()) || hashtag.equals(basehash)
                            || keywords.get(count).replace("#", "").equals(basehash.toLowerCase())
                            || keywords.get(count).equals(basehash)) {
                        inBase = true;
                        break;
                    }
                }

                // base keywords use a looser threshold (0.8) than other hashtags (0.5);
                // a smaller cosine distance means the two terms are more similar
                if (inBase) {
                    if (distVal < 0.8 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                        keywordsFreqName.add(hashtag);
                        keywordsFreq.add(seq2);
                        keywords.add(hashtag);
                        System.out.println("BASEL***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                + distVal + "***********");
                    }
                } else {
                    if (distVal < 0.5 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                        keywordsFreqName.add(hashtag);
                        keywordsFreq.add(seq2);
                        keywords.add(hashtag);
                        System.out.println("OTHER***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                + distVal + "***********");
                    }
                }
            }
        }
        keywordsFreqName.remove(0);
        keywordsFreq.remove(0);
        count++;
    }
    return keywords.toArray(new String[keywords.size()]);
}
From source file:io.ssc.relationdiscovery.Runner.java
License:Open Source License
public static void main(String[] args) throws IOException {

    File labelsFile = new File("/home/ssc/Desktop/alan/R30 Tupel/feature_dict/part-r-00000");
    File occurrencesFile = new File("/home/ssc/Desktop/alan/R30 Tupel/tupleids/occurrences.tsv");

    // number of entity pairs in the data
    int numEntityPairs = 7853;
    // number of patterns in the data
    int numPatterns = 58702;
    // desired rank for dimension reduction
    int rank = 25;
    // distance measure for clustering
    DistanceMeasure distanceMeasure = new CosineDistanceMeasure();
    // number of clusters (k of k-Means)
    int numClusters = 10;
    // maximum number of iterations to run
    int maxIterations = 100;
    // number of points to print per cluster
    int numClosestPointsPerCluster = 20;

    long start = System.currentTimeMillis();

    OpenIntObjectHashMap<String> labels = Utils.loadLabels(labelsFile);
    Matrix A = Utils.loadOccurrences(occurrencesFile, numPatterns, numEntityPairs);

    SVD svd = new SVD(A, rank);
    svd.compute();
    Matrix P = svd.projectRowsOntoFeatureSpace();

    KMeans kMeans = new KMeans(P, numClusters, distanceMeasure);
    kMeans.run(maxIterations);

    for (int n = 0; n < numClusters; n++) {
        System.out.println("-----" + n + "------");
        kMeans.printClosestPoints(n, numClosestPointsPerCluster, labels);
        System.out.println("\n");
    }

    System.out.println("Computation took " + (System.currentTimeMillis() - start) + "ms");
}
From source file:net.aprendizajengrande.ontocluster.Clusterer.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }
    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));

    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
From source file:org.sleuthkit.hadoop.ClusterDocumentsJob.java
License:Open Source License
/**
 * Runs the clustering algorithms on the tfidf vectors that have been placed in
 * sequence files in directory 'input', and puts raw cluster/json data in
 * 'output'. Also puts json reporting data in the reports/data folder.
 * @param input The sequence files to cluster on.
 * @param output The output directory for raw canopy/kmeans cluster data.
 * @param dictionary The dictionary vector which maps the indices of the vectors
 * to words.
 * @param t1 The t1 value for canopy clustering. The distance measure for
 * canopy is CosineDistanceMeasure, so this should be a value between 0 and 1.
 * @param t2 The t2 value for canopy clustering. Again, should be between
 * t1 and 1. A smaller distance between the two results in more clusters;
 * a greater distance results in fewer.
 * @param imageID The hash of the image.
 * @param friendlyName The friendly, user given name of the image.
 * @param baseDir The base directory where output data for this image
 * is stored. Used to place the reporting data in the correct location.
 * @return A status code; will be non-zero if the task failed.
 */
public static int runPipeline(String input, String output, String dictionary, double t1, double t2,
        String imageID, String friendlyName, String baseDir) {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Path canopyInputPath = new Path(input);
    Path canopyOutputPath = new Path(output + "/canopy");

    Path kmeansInputPath = new Path(input);
    Path kmeansOutputPath = new Path(output + "/kmeans");
    // Canopy (I'm quite certain) only does one pass, so the relevant
    // clusters should be found in this file. For KMeans, this may not
    // be the case. Note, though, that the final clusters with document
    // vectors will be in a different file.
    Path kmeansClusters = new Path(output + "/canopy/clusters-0");

    try {
        CanopyDriver.run(conf, canopyInputPath, canopyOutputPath, new CosineDistanceMeasure(), t1, t2, true,
                false);
    } catch (Exception e) {
        LOG.error("Failure running mahout canopy.", e);
        return 1;
    }

    // The convergencedelta and maxiterations affect how long kmeans will
    // take to run and how many times we run the algorithm before we give
    // up. The numbers we are using here seem to give reasonably good
    // results.
    try {
        KMeansDriver.run(conf, kmeansInputPath, kmeansClusters, kmeansOutputPath, new CosineDistanceMeasure(),
                .5, 20, true, false);
    } catch (Exception e) {
        LOG.error("Failure running mahout kmeans.", e);
        return 2;
    }

    try {
        ////////////////////////////////
        // Output top cluster matches //
        ////////////////////////////////
        Job job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_MATCH);
        job.setJarByClass(TopFeatureMapper.class);

        // Get the final kmeans iteration. This is sort of a pain but for
        // whatever reason hadoop has no mechanism to do this for us.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        int i = 2;
        Path goodPath = new Path(output + "/kmeans/clusters-1");
        while (true) {
            Path testPath = new Path(output + "/kmeans/clusters-" + i);
            if (!fs.exists(testPath)) {
                break;
            }
            i++;
            goodPath = testPath;
        }

        FileInputFormat.setInputPaths(job, goodPath);
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/topClusters/"));

        job.setMapperClass(TopFeatureMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // We need to reduce serially.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.getConfiguration().set("org.sleuthkit.hadoop.dictionary", dictionary);

        job.waitForCompletion(true);

        ////////////////////////////////
        // Output Cluster->DocID JSON //
        ////////////////////////////////
        job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_JSON);
        job.setJarByClass(JSONClusterNameMapper.class);

        FileInputFormat.setInputPaths(job, new Path(output + "/kmeans/clusteredPoints/"));
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/jsonClusteredPoints/"));

        job.setMapperClass(JSONClusterNameMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // again, we need to reduce serially. We are crafting a single json object and so we must
        // have exactly one output file.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);

        // Note that, since we limit the number of reduce tasks to 1, there should only be
        // one reduce 'part'.
        ClusterJSONBuilder.buildReport(new Path(output + "/kmeans/topClusters/part-r-00000"),
                new Path(output + "/kmeans/jsonClusteredPoints/part-r-00000"),
                new Path(baseDir + "/reports/data/documents.js"));
        return 0;
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
    }
    // we have failed; return non-zero error code.
    return 3;
}
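For orientation, a hedged sketch of how runPipeline might be invoked; every path, threshold and identifier below is an illustrative placeholder rather than a value taken from the project:

// All arguments are hypothetical; t1/t2 follow the javadoc's guidance that both lie in (0, 1).
int status = ClusterDocumentsJob.runPipeline(
        "/data/img/vectors",      // tfidf vector sequence files
        "/data/img/clusters",     // output directory for raw canopy/kmeans data
        "/data/img/dictionary",   // dictionary vector mapping indices to words
        0.65,                     // t1
        0.9,                      // t2
        "0123456789abcdef",       // image hash
        "example-image",          // friendly image name
        "/data/img");             // base directory for report output
if (status != 0) {
    System.err.println("Clustering pipeline failed with status " + status);
}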