List of usage examples for the org.apache.mahout.common.distance.EuclideanDistanceMeasure constructor, EuclideanDistanceMeasure().
From source file:chapter5.KMeanSample.java
License:Apache License
public static void main(String[] args) throws Exception { Path output = new Path("output"); Configuration conf = new Configuration(); HadoopUtil.delete(conf, output);//w w w .j a va 2s . co m run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10); }
From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java
License:Apache License
/**
 * Entry point for the canopy-clustering example. With CLI arguments, delegates to
 * ToolRunner; with none, clears "output" and runs a default job over "testdata"
 * using Euclidean distance and the thresholds 80 and 55 (presumably t1/t2 — confirm
 * against the run(...) signature).
 */
public static void main(String[] args) throws Exception {
    if (args.length > 0) {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new CanopyClustering(), args);
        return;
    }
    log.info("Running with default arguments");
    Path output = new Path("output");
    // Remove stale output from any previous run before starting.
    HadoopUtil.delete(new Configuration(), output);
    run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
}
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/**
 * Entry point for this k-means example job. With CLI arguments, delegates to
 * ToolRunner; with none, clears "output" and clusters "testdata" using Euclidean
 * distance with arguments (6, 0.5, 10) — presumably k, convergence delta, and max
 * iterations; confirm against the run(...) signature.
 */
public static void main(String[] args) throws Exception {
    if (args.length > 0) {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new Job(), args);
        return;
    }
    log.info("Running with default arguments");
    Path output = new Path("output");
    Configuration conf = new Configuration();
    // Remove stale output from any previous run before starting.
    HadoopUtil.delete(conf, output);
    run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
}
From source file:com.nm.documentClustering.example.KMeansJob.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length > 0) { log.info("Running with only user-supplied arguments"); ToolRunner.run(new Configuration(), new KMeansJob(), args); } else {//from ww w .jav a 2 s .c om log.info("Running with default arguments"); Path output = new Path("output"); Configuration conf = new Configuration(); HadoopUtil.delete(conf, output); run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10); } }
From source file:com.queirozf.clustering.MahoutKMeans.java
License:Apache License
@Override public int run(String[] args) throws Exception { Path in = new Path(args[0]); Path out = new Path(args[1]); int k = Integer.parseInt(args[2]); double epsilon = 0.001; int maxIterations = 10000; Configuration conf = this.getConf(); DistanceMeasure measure = new EuclideanDistanceMeasure(); Path centroids = RandomSeedGenerator.buildRandom(conf, in, new Path(out, "data/clusters"), k, measure); KMeansDriver.run(conf, in, centroids, out, epsilon, maxIterations, true, 0.0, false); return 0;// w ww . j a v a2 s . c o m }
From source file:edu.dfci.cccb.mev.kmeans.domain.hadoop.HadoopKMeansBuilder.java
License:Open Source License
@Override public KMeans build() throws DatasetException { try (TemporaryFolder hadoop = new TemporaryFolder()) { File points = new File(hadoop, "points"); points.mkdir();//from w ww. j a va 2 s. c o m Configuration configuration = new Configuration(); FileSystem system = get(configuration); final Dimension other = dataset().dimension(dimension().type() == ROW ? COLUMN : ROW); List<NamedVector> vectors = new AbstractList<NamedVector>() { @Override public NamedVector get(int index) { final String vector = dimension().keys().get(index); return new NamedVector(new AbstractVector(other.keys().size()) { @Override public void setQuick(int index, double value) { throw new UnsupportedOperationException(); } @Override public Vector like() { return new RandomAccessSparseVector(size()); } @Override public Iterator<Element> iterator() { return new Iterator<Element>() { private int current = 0; @Override public boolean hasNext() { return current < other.keys().size(); } @Override public Element next() { return new Element() { private final int index = current++; @Override public void set(double value) { throw new UnsupportedOperationException(); } @Override public int index() { return index; } @Override @SneakyThrows(InvalidCoordinateException.class) public double get() { return dimension().type() == ROW ? dataset().values().get(vector, other.keys().get(index)) : dataset().values().get(other.keys().get(index), vector); } }; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public Iterator<Element> iterateNonZero() { return iterator(); } @Override public boolean isSequentialAccess() { return true; } @Override public boolean isDense() { return true; } @Override @SneakyThrows(InvalidCoordinateException.class) public double getQuick(int index) { return dimension().type() == ROW ? 
dataset().values().get(vector, other.keys().get(index)) : dataset().values().get(other.keys().get(index), vector); } @Override public int getNumNondefaultElements() { return other.keys().size(); } @Override protected Matrix matrixLike(int rows, int columns) { throw new UnsupportedOperationException(); } }, vector); } @Override public int size() { return dimension().keys().size(); } }; // write input try (Writer writer = new Writer(system, configuration, new Path(new File(points, "file1").getAbsolutePath()), LongWritable.class, VectorWritable.class)) { VectorWritable writable = new VectorWritable(); long record = 0; for (Vector vector : vectors) { writable.set(vector); writer.append(new LongWritable(record++), writable); } } // prepare clusters File clusters = new File(hadoop, "clusters"); clusters.mkdir(); try (Writer writer = new Writer(system, configuration, new Path(new File(clusters, "part-00000").getAbsolutePath()), Text.class, Cluster.class)) { for (int i = 0; i < k(); i++) { Vector vec = vectors.get(i); Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure()); writer.append(new Text(cluster.getIdentifier()), cluster); } } File output = new File(hadoop, "output"); output.mkdir(); try { run(configuration, new Path(points.getAbsolutePath()), new Path(clusters.getAbsolutePath()), new Path(output.getAbsolutePath()), metric.measurer(), convergence(), iterations(), true, false); try (Reader reader = new Reader(system, new Path( new File(new File(output, CLUSTERED_POINTS_DIR), "/part-m-00000").getAbsolutePath()), configuration)) { IntWritable key = new IntWritable(); WeightedVectorWritable value = new WeightedVectorWritable(); Map<String, Set<String>> result = new HashMap<>(); while (reader.next(key, value)) { Set<String> cluster = result.get(key.toString()); if (cluster == null) result.put(key.toString(), cluster = new HashSet<>()); cluster.add(((NamedVector) value.getVector()).getName()); } return new AbstractKMeans() { 
}.dataset(dataset()).dimension(dimension()).name(name()).type(type()) .clusters(new HashSet<>(result.values())); } } catch (ClassNotFoundException | InterruptedException e) { throw new DatasetException(e); } } catch (IOException e) { throw new DatasetException(e); } }
From source file:hack.VectorScan.java
License:Apache License
private static SimplexSpace<String>[] makeSpaces(int start, int n, boolean doCount) { // Hasher hasher = new OrthonormalHasher(DIMS, 0.001d); lsh.mahout.core.Hasher hasher = new lsh.mahout.core.VertexTransitiveHasher(DIMS, 0.001d); DistanceMeasure measure = new EuclideanDistanceMeasure(); SimplexSpace<String>[] spaces = new SimplexSpace[n]; for (int i = start; i < n; i++) { SimplexSpace<String> space = new SimplexSpace<String>(hasher, DIMS, measure, false, doCount); spaces[i] = space;/*from w ww .j a v a2 s.c o m*/ space.setLOD(i); } return spaces; }
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String hdfsUrl = conf.get("fs.defaultFS"); // part1--------------------------------------------------------------- // Job job0 = Job.getInstance(conf, "siftKeywordsDimension"); // Path output1Path=new Path(hdfsUrl + "/data/recommend/matrix1"); // HadoopUtil.delete(conf, output1Path); // job0.setJarByClass(TFIDF.class); // job0.setMapperClass(Mapper_Part1.class); // job0.setReducerClass(Reduce_Part1.class); // job0.setMapOutputKeyClass(Text.class); // job0.setMapOutputValueClass(Text.class); // job0.setOutputKeyClass(Text.class); // job0.setOutputValueClass(Text.class); // job0.setPartitionerClass(CustomPartitioner.class); // FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/tfidf3")); // FileOutputFormat.setOutputPath(job0, output1Path); // job0.waitForCompletion(true); // part2--------------------------------------------------------------- // FileSystem fsopen = FileSystem.get(conf); // FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000")); // Scanner scan = new Scanner(in); // List<String> keywordList=new ArrayList<String>(); // while (scan.hasNext()) { // keywordList.add(scan.next()); // }//from www. j ava 2 s . 
c o m //// must before job // conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()])); // Job job1 = Job.getInstance(conf, "generateMatrix"); // Path output2Path=new Path(hdfsUrl + "/data/recommend/matrix2"); // HadoopUtil.delete(conf, output2Path); // job1.setJarByClass(TFIDF.class); // job1.setMapperClass(Mapper_Part2.class); // job1.setReducerClass(Reduce_Part2.class); // job1.setMapOutputKeyClass(Text.class); // job1.setMapOutputValueClass(Text.class); // job1.setOutputKeyClass(Text.class); // job1.setOutputValueClass(NullWritable.class); //// job1.addCacheFile(new Path("/data/recommend/matrix1/part-r-00000").toUri()); // FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf3")); // FileOutputFormat.setOutputPath(job1, output2Path); // job1.waitForCompletion(true); // part3-------------------??-------------------------------------------- Path output3Path = new Path(hdfsUrl + "/data/recommend/cluster2"); HadoopUtil.delete(conf, output3Path); EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure(); Path clusterInput = new Path(hdfsUrl + "/data/recommend/matrix2"); Path clusterSeqInput = new Path(hdfsUrl + "/data/recommend/cluster1"); Path clusterOutput = new Path(hdfsUrl + "/data/recommend/cluster2"); int k = 10; int maxIter = 3; // ?mahout??? // InputDriver.runJob(clusterInput, clusterSeqInput, "org.apache.mahout.math.RandomAccessSparseVector"); // ?k Path clusters = RandomSeedGenerator.buildRandom(conf, clusterSeqInput, new Path(clusterOutput, "clusters-0"), k, measure); KMeansDriver.run(conf, clusterSeqInput, clusters, clusterOutput, 0.01, maxIter, true, 0.0, false); // ClusterDumper printClusters ??? 
ClusterDumper clusterDumper = new ClusterDumper(new Path(clusterOutput, "clusters-" + (maxIter - 1)), new Path(clusterOutput, "clusteredPoints")); clusterDumper.printClusters(null); clusterOutput(conf, new Path(hdfsUrl + "/data/recommend/cluster2/clusteredPoints/part-m-00000")); // clusterOutput2(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster2/clusteredPoints/part-m-00000")); // matrix2Vector(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster1/part-m-00000"));// }
From source file:org.conan.mymahout.clustering.syntheticcontrol.canopy.Job.java
License:Apache License
/**
 * Entry point for the synthetic-control canopy example. With CLI arguments,
 * delegates to ToolRunner; with none, clears "output" and runs a default job over
 * "testdata" using Euclidean distance and thresholds 80 and 55 (presumably t1/t2 —
 * confirm against the run(...) signature).
 */
public static void main(String[] args) throws Exception {
    if (args.length > 0) {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new Job(), args);
        return;
    }
    log.info("Running with default arguments");
    Path output = new Path("output");
    // Remove stale output from any previous run before starting.
    HadoopUtil.delete(new Configuration(), output);
    run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
}
From source file:org.conan.mymahout.clustering.syntheticcontrol.fuzzykmeans.Job.java
License:Apache License
/**
 * Entry point for the synthetic-control fuzzy k-means example. With CLI arguments,
 * delegates to ToolRunner; with none, clears "output" and runs a default job over
 * "testdata" using Euclidean distance with arguments (80, 55, 10, 2.0f, 0.5) —
 * presumably canopy thresholds, max iterations, fuzziness, and convergence delta;
 * confirm against the run(...) signature.
 */
public static void main(String[] args) throws Exception {
    if (args.length > 0) {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new Job(), args);
        return;
    }
    log.info("Running with default arguments");
    Path output = new Path("output");
    Configuration conf = new Configuration();
    // Remove stale output from any previous run before starting.
    HadoopUtil.delete(conf, output);
    run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
}