Example usage for org.apache.mahout.common.distance EuclideanDistanceMeasure EuclideanDistanceMeasure

List of usage examples for org.apache.mahout.common.distance EuclideanDistanceMeasure EuclideanDistanceMeasure

Introduction

On this page you can find example usages of the org.apache.mahout.common.distance EuclideanDistanceMeasure constructor.

Prototype

EuclideanDistanceMeasure

Source Link

Usage

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Runs the example with fixed defaults: deletes any previous "output"
 * directory, then invokes {@code run} on "testdata" with a Euclidean
 * distance measure and the hard-coded parameters 6, 0.5 and 10.
 */
public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    Path outputPath = new Path("output");
    // Remove results left over from a previous run.
    HadoopUtil.delete(configuration, outputPath);
    run(configuration, new Path("testdata"), outputPath, new EuclideanDistanceMeasure(), 6, 0.5, 10);
}

From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java

License:Apache License

/**
 * Entry point. Arguments, if any, are forwarded untouched to ToolRunner;
 * with no arguments the job deletes "output" and invokes {@code run} on
 * "testdata" with a Euclidean distance measure and parameters 80 and 55.
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        log.info("Running with default arguments");
        Path outputPath = new Path("output");
        // Clear stale results from any earlier run.
        HadoopUtil.delete(new Configuration(), outputPath);
        run(new Path("testdata"), outputPath, new EuclideanDistanceMeasure(), 80, 55);
    } else {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new CanopyClustering(), args);
    }
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Entry point. Arguments, if any, are forwarded untouched to ToolRunner;
 * with no arguments the job deletes "output" and invokes {@code run} on
 * "testdata" with a Euclidean distance measure and parameters 6, 0.5 and 10.
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        log.info("Running with default arguments");
        Configuration configuration = new Configuration();
        Path outputPath = new Path("output");
        // Clear stale results from any earlier run.
        HadoopUtil.delete(configuration, outputPath);
        run(configuration, new Path("testdata"), outputPath, new EuclideanDistanceMeasure(), 6, 0.5, 10);
    } else {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new Job(), args);
    }
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Entry point. Arguments, if any, are forwarded untouched to ToolRunner;
 * with no arguments the job deletes "output" and invokes {@code run} on
 * "testdata" with a Euclidean distance measure and parameters 6, 0.5 and 10.
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        log.info("Running with default arguments");
        Configuration configuration = new Configuration();
        Path outputPath = new Path("output");
        // Clear stale results from any earlier run.
        HadoopUtil.delete(configuration, outputPath);
        run(configuration, new Path("testdata"), outputPath, new EuclideanDistanceMeasure(), 6, 0.5, 10);
    } else {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new KMeansJob(), args);
    }
}

From source file:com.queirozf.clustering.MahoutKMeans.java

License:Apache License

/**
 * Seeds k random clusters from the input vectors and runs the k-means
 * driver over them.
 *
 * Usage: {@code <input> <output> <k> [epsilon] [maxIterations]}
 *
 * The optional epsilon (convergence delta, default 0.001) and
 * maxIterations (default 10000) generalize the previously hard-coded
 * values; a call with exactly three arguments behaves as before.
 *
 * @param args command-line arguments as described above
 * @return 0 on success
 * @throws IllegalArgumentException if fewer than three arguments are given
 * @throws Exception if seed generation or the k-means driver fails
 */
@Override
public int run(String[] args) throws Exception {

    // Fail fast with a usage hint instead of an obscure
    // ArrayIndexOutOfBoundsException on missing arguments.
    if (args.length < 3) {
        throw new IllegalArgumentException("Usage: <input> <output> <k> [epsilon] [maxIterations]");
    }

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    int k = Integer.parseInt(args[2]);

    // Previously hard-coded; now overridable from the command line.
    double epsilon = args.length > 3 ? Double.parseDouble(args[3]) : 0.001;
    int maxIterations = args.length > 4 ? Integer.parseInt(args[4]) : 10000;

    Configuration conf = this.getConf();

    DistanceMeasure measure = new EuclideanDistanceMeasure();

    // Pick k random input vectors as the initial cluster seeds.
    Path centroids = RandomSeedGenerator.buildRandom(conf, in, new Path(out, "data/clusters"), k, measure);

    KMeansDriver.run(conf, in, centroids, out, epsilon, maxIterations, true, 0.0, false);

    return 0;
}

From source file:edu.dfci.cccb.mev.kmeans.domain.hadoop.HadoopKMeansBuilder.java

License:Open Source License

/**
 * Runs k-means over the dataset's vectors via a local Hadoop sequence-file
 * round trip inside a temporary folder and returns the clustering.
 *
 * Steps:
 * 1. expose the clustered dimension as a lazy, read-only List of
 *    NamedVectors backed directly by dataset().values();
 * 2. serialize those vectors to a sequence file ("points/file1");
 * 3. seed initial clusters from the first k() vectors
 *    ("clusters/part-00000");
 * 4. run the clustering job, then read back the clustered points and
 *    group the member vector names by cluster id.
 *
 * @return a KMeans result whose clusters are sets of member names
 * @throws DatasetException on I/O failure, interruption, or when the job's
 *         classes cannot be loaded
 */
@Override
public KMeans build() throws DatasetException {
    try (TemporaryFolder hadoop = new TemporaryFolder()) {
        // Working directory that will hold the vector input file.
        File points = new File(hadoop, "points");
        points.mkdir();

        Configuration configuration = new Configuration();
        FileSystem system = get(configuration);
        // The dimension orthogonal to the one being clustered: rows are
        // clustered over columns and vice versa.
        final Dimension other = dataset().dimension(dimension().type() == ROW ? COLUMN : ROW);

        // Lazy adapter: element i is a read-only NamedVector view over the
        // dataset values for key i of the clustered dimension. Nothing is
        // copied; every read goes straight to dataset().values().
        List<NamedVector> vectors = new AbstractList<NamedVector>() {

            @Override
            public NamedVector get(int index) {
                final String vector = dimension().keys().get(index);
                return new NamedVector(new AbstractVector(other.keys().size()) {

                    // This view is read-only; all mutators throw.
                    @Override
                    public void setQuick(int index, double value) {
                        throw new UnsupportedOperationException();
                    }

                    @Override
                    public Vector like() {
                        return new RandomAccessSparseVector(size());
                    }

                    // Sequential iteration over all coordinates of the view.
                    @Override
                    public Iterator<Element> iterator() {
                        return new Iterator<Element>() {
                            private int current = 0;

                            @Override
                            public boolean hasNext() {
                                return current < other.keys().size();
                            }

                            @Override
                            public Element next() {
                                return new Element() {
                                    // Capture the position; the outer cursor
                                    // advances as a side effect of next().
                                    private final int index = current++;

                                    @Override
                                    public void set(double value) {
                                        throw new UnsupportedOperationException();
                                    }

                                    @Override
                                    public int index() {
                                        return index;
                                    }

                                    // Orientation decides which key indexes
                                    // rows and which indexes columns.
                                    @Override
                                    @SneakyThrows(InvalidCoordinateException.class)
                                    public double get() {
                                        return dimension().type() == ROW
                                                ? dataset().values().get(vector, other.keys().get(index))
                                                : dataset().values().get(other.keys().get(index), vector);
                                    }
                                };
                            }

                            @Override
                            public void remove() {
                                throw new UnsupportedOperationException();
                            }
                        };
                    }

                    // Dense view: every coordinate is considered present.
                    @Override
                    public Iterator<Element> iterateNonZero() {
                        return iterator();
                    }

                    @Override
                    public boolean isSequentialAccess() {
                        return true;
                    }

                    @Override
                    public boolean isDense() {
                        return true;
                    }

                    // Same orientation-dependent lookup as Element.get().
                    @Override
                    @SneakyThrows(InvalidCoordinateException.class)
                    public double getQuick(int index) {
                        return dimension().type() == ROW
                                ? dataset().values().get(vector, other.keys().get(index))
                                : dataset().values().get(other.keys().get(index), vector);
                    }

                    @Override
                    public int getNumNondefaultElements() {
                        return other.keys().size();
                    }

                    @Override
                    protected Matrix matrixLike(int rows, int columns) {
                        throw new UnsupportedOperationException();
                    }
                }, vector);
            }

            @Override
            public int size() {
                return dimension().keys().size();
            }
        };

        // Write the input vectors as a (record number -> vector) sequence file.
        try (Writer writer = new Writer(system, configuration,
                new Path(new File(points, "file1").getAbsolutePath()), LongWritable.class,
                VectorWritable.class)) {
            VectorWritable writable = new VectorWritable();
            long record = 0;
            for (Vector vector : vectors) {
                writable.set(vector);
                writer.append(new LongWritable(record++), writable);
            }
        }

        // Seed the initial clusters from the first k() vectors.
        // NOTE(review): seeds are built with EuclideanDistanceMeasure while
        // the job below runs with metric.measurer() — confirm the mismatch
        // is intentional.
        File clusters = new File(hadoop, "clusters");
        clusters.mkdir();
        try (Writer writer = new Writer(system, configuration,
                new Path(new File(clusters, "part-00000").getAbsolutePath()), Text.class, Cluster.class)) {
            for (int i = 0; i < k(); i++) {
                Vector vec = vectors.get(i);
                Cluster cluster = new Cluster(vec, i, new EuclideanDistanceMeasure());
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        }

        File output = new File(hadoop, "output");
        output.mkdir();

        try {
            // Run the clustering job over the serialized points and seeds.
            run(configuration, new Path(points.getAbsolutePath()), new Path(clusters.getAbsolutePath()),
                    new Path(output.getAbsolutePath()), metric.measurer(), convergence(), iterations(), true,
                    false);

            // Read back the clustered points and group member names by
            // cluster id.
            try (Reader reader = new Reader(system, new Path(
                    new File(new File(output, CLUSTERED_POINTS_DIR), "/part-m-00000").getAbsolutePath()),
                    configuration)) {
                IntWritable key = new IntWritable();
                WeightedVectorWritable value = new WeightedVectorWritable();
                Map<String, Set<String>> result = new HashMap<>();

                while (reader.next(key, value)) {
                    Set<String> cluster = result.get(key.toString());
                    if (cluster == null)
                        result.put(key.toString(), cluster = new HashSet<>());
                    cluster.add(((NamedVector) value.getVector()).getName());
                }

                return new AbstractKMeans() {
                }.dataset(dataset()).dimension(dimension()).name(name()).type(type())
                        .clusters(new HashSet<>(result.values()));
            }
        } catch (ClassNotFoundException | InterruptedException e) {
            // Wrap job failures in the builder's checked exception type.
            throw new DatasetException(e);
        }
    } catch (IOException e) {
        throw new DatasetException(e);
    }
}

From source file:hack.VectorScan.java

License:Apache License

/**
 * Builds an array of {@code n} SimplexSpace instances that share a single
 * VertexTransitiveHasher and a single Euclidean distance measure; each
 * populated space has its LOD set to its array index.
 *
 * NOTE(review): the array has length {@code n} but the loop starts at
 * {@code start}, so slots 0..start-1 stay null whenever start &gt; 0 —
 * confirm callers expect that rather than an array of n - start spaces.
 *
 * @param start first array index to populate
 * @param n array length and exclusive upper bound of the loop
 * @param doCount forwarded to each SimplexSpace constructor
 * @return the (possibly partially populated) array of spaces
 */
private static SimplexSpace<String>[] makeSpaces(int start, int n, boolean doCount) {
    //    Hasher hasher = new OrthonormalHasher(DIMS, 0.001d);
    lsh.mahout.core.Hasher hasher = new lsh.mahout.core.VertexTransitiveHasher(DIMS, 0.001d);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    SimplexSpace<String>[] spaces = new SimplexSpace[n];
    for (int i = start; i < n; i++) {
        SimplexSpace<String> space = new SimplexSpace<String>(hasher, DIMS, measure, false, doCount);
        spaces[i] = space;
        space.setLOD(i);
    }
    return spaces;
}

From source file:hk.newsRecommender.MatrixAndCluster.java

License:Open Source License

/**
 * Clusters the keyword matrix with k-means and dumps the resulting
 * clusters. Earlier pipeline stages (part 1: keyword dimension sifting,
 * part 2: matrix generation) were commented out in the original source;
 * only the part-3 clustering step below is active.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    String hdfsUrl = configuration.get("fs.defaultFS");

    // part 3: delete stale output, then cluster the sequence-file vectors.
    Path staleOutput = new Path(hdfsUrl + "/data/recommend/cluster2");
    HadoopUtil.delete(configuration, staleOutput);

    EuclideanDistanceMeasure distanceMeasure = new EuclideanDistanceMeasure();
    // Input of the disabled InputDriver conversion step (part of the
    // commented-out pipeline); retained for reference.
    Path clusterInput = new Path(hdfsUrl + "/data/recommend/matrix2");
    Path clusterSeqInput = new Path(hdfsUrl + "/data/recommend/cluster1");
    Path clusterOutput = new Path(hdfsUrl + "/data/recommend/cluster2");
    int k = 10;
    int maxIter = 3;

    // Seed k random clusters, then run k-means with convergence delta 0.01
    // for at most maxIter iterations, clustering the points afterwards.
    Path seedClusters = RandomSeedGenerator.buildRandom(configuration, clusterSeqInput,
            new Path(clusterOutput, "clusters-0"), k, distanceMeasure);
    KMeansDriver.run(configuration, clusterSeqInput, seedClusters, clusterOutput, 0.01, maxIter, true, 0.0, false);

    // Dump the final iteration's clusters for inspection.
    ClusterDumper dumper = new ClusterDumper(new Path(clusterOutput, "clusters-" + (maxIter - 1)),
            new Path(clusterOutput, "clusteredPoints"));
    dumper.printClusters(null);

    clusterOutput(configuration, new Path(hdfsUrl + "/data/recommend/cluster2/clusteredPoints/part-m-00000"));

}

From source file:org.conan.mymahout.clustering.syntheticcontrol.canopy.Job.java

License:Apache License

/**
 * Entry point. Arguments, if any, are forwarded untouched to ToolRunner;
 * with no arguments the job deletes "output" and invokes {@code run} on
 * "testdata" with a Euclidean distance measure and parameters 80 and 55.
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        log.info("Running with default arguments");
        Path outputPath = new Path("output");
        // Clear stale results from any earlier run.
        HadoopUtil.delete(new Configuration(), outputPath);
        run(new Path("testdata"), outputPath, new EuclideanDistanceMeasure(), 80, 55);
    } else {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new Job(), args);
    }
}

From source file:org.conan.mymahout.clustering.syntheticcontrol.fuzzykmeans.Job.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length > 0) {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new Job(), args);
    } else {/*from  ww  w  . j  a  v  a  2 s.  co  m*/
        log.info("Running with default arguments");
        Path output = new Path("output");
        Configuration conf = new Configuration();
        HadoopUtil.delete(conf, output);
        run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
    }
}