List of usage examples for org.apache.mahout.common.distance.CosineDistanceMeasure
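CosineDistanceMeasure computes the cosine distance between two Mahout Vectors: 1 minus the cosine similarity, so the result is 0 for vectors pointing in the same direction and 1 for orthogonal vectors. A minimal sketch of calling it directly (the vector values are illustrative only):

import org.apache.mahout.common.distance.CosineDistanceMeasure;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class CosineDistanceExample {
    public static void main(String[] args) {
        DistanceMeasure measure = new CosineDistanceMeasure();
        Vector a = new DenseVector(new double[] { 1.0, 2.0, 3.0 });
        Vector b = new DenseVector(new double[] { 2.0, 4.0, 6.0 });
        // b is a scalar multiple of a, so the cosine distance is 0.0
        System.out.println(measure.distance(a, b));
    }
}

The usage examples below embed the same call in factories, MapReduce reducers, and Hadoop clustering drivers.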
From source file:cc.recommenders.mining.calls.DistanceMeasureFactory.java
License:Open Source License
public DistanceMeasure get() {
    switch (options.getDistanceMeasure()) {
    case COSINE:
        return new CosineDistanceMeasure();
    case MANHATTAN:
        return new ManhattanDistanceMeasure();
    }
    throw new RuntimeException("unknown distance measure");
}
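When the factory above is wired into a clustering pipeline, the returned object is used through the common DistanceMeasure interface; a hedged sketch (the factory instance and the two vectors are assumed placeholders, only get() and distance() come from the snippet):

// 'factory', 'rowA' and 'rowB' are hypothetical; substitute whatever the caller holds.
DistanceMeasure measure = factory.get();
double d = measure.distance(rowA, rowB); // cosine distance when the options select COSINE

Returning directly from each case and throwing for anything unmatched makes an unhandled enum constant fail fast instead of silently falling through.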
From source file:com.grantingersoll.intell.clustering.KMeansClusteringEngine.java
License:Apache License
public KMeansClusteringEngine() {
    execService = Executors.newSingleThreadExecutor();
    measure = new CosineDistanceMeasure();
}
From source file:com.gsvic.csmr.CosineSimilarityReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterable<VectorArrayWritable> value, Context context)
        throws IOException, InterruptedException {

    CosineDistanceMeasure cdm = new CosineDistanceMeasure();
    VectorWritable docX, docY;
    double cosine;

    for (VectorArrayWritable v : value) {
        docX = (VectorWritable) v.get()[0];
        docY = (VectorWritable) v.get()[1];
        // distance() returns the cosine distance, i.e. 1 - cosine similarity
        cosine = cdm.distance(docX.get(), docY.get());
        context.write(key, new DoubleWritable(cosine));
    }
}
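Despite the reducer's name, the value it emits is the cosine distance, not the similarity: 0 means the two document vectors point in the same direction, and larger values mean they diverge. If the similarity itself is wanted, it can be recovered with a one-line conversion; a hedged sketch reusing the variables above:

double similarity = 1.0 - cdm.distance(docX.get(), docY.get()); // cosine similarity rather than distance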
From source file:crawler.CETReCrawler.java
License:Apache License
private String[] distFilter() {
    DataSearch keywordsFilter = new DataSearch(TFFreq.keySet().toArray(new String[TFFreq.size()])); //pass potential keywords
    keywordsFilter.collectNewTweets();
    keywordsFilter.calculateTFIDF();

    ArrayList<String> keywordsFreqName = new ArrayList<String>();
    ArrayList<Vector> keywordsFreq = new ArrayList<Vector>(); //list of keywords vector
    ArrayList<String> keywords = new ArrayList<String>(); //list of keywords

    for (String basehash : Settings.baseKeywords) {
        keywordsFreqName.add(basehash);
        keywordsFreq.add(keywordsFilter.getSeedVect(basehash));
        keywords.add(basehash);
    }

    int count = 0;
    while (!keywordsFreq.isEmpty()) {
        String seq1Name = keywordsFreqName.get(0);
        Vector seq1 = keywordsFreq.get(0);
        Vector seq2;
        Iterator<Entry<String, Integer>> iter = TFFreq.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, Integer> ent = iter.next();
            String hashtag = ent.getKey();
            seq2 = keywordsFilter.getSeedVect(hashtag);
            //System.out.print("seq 1 ["+seq1Name+"] count ["+TFFreq.get(seq1Name)+"]: ");
            //System.out.println(seq1.asFormatString());
            //System.out.print("seq 2 ["+hashtag+"] count ["+TFFreq.get(hashtag)+"]: ");
            if (seq1 != null && seq2 != null) {
                //System.out.println(seq2.asFormatString());
                CosineDistanceMeasure cos = new CosineDistanceMeasure();
                double distVal = cos.distance(seq1, seq2);
                //System.out.println("***********["+hashtag+"] v.s ["+seq1Name+"]: "+distVal+"***********");

                //check whether one of the two words is baseline criteria
                boolean inBase = false;
                for (String basehash : Settings.baseKeywords) {
                    if (hashtag.replace("#", "").equals(basehash.toLowerCase()) || hashtag.equals(basehash)
                            || keywords.get(count).replace("#", "").equals(basehash.toLowerCase())
                            || keywords.get(count).equals(basehash)) {
                        inBase = true;
                        break;
                    }
                }

                // base keywords use a looser threshold (0.8) than other hashtags (0.5);
                // a smaller cosine distance means the two terms are more similar
                if (inBase) {
                    if (distVal < 0.8 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                        keywordsFreqName.add(hashtag);
                        keywordsFreq.add(seq2);
                        keywords.add(hashtag);
                        System.out.println("BASEL***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                + distVal + "***********");
                    }
                } else {
                    if (distVal < 0.5 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                        keywordsFreqName.add(hashtag);
                        keywordsFreq.add(seq2);
                        keywords.add(hashtag);
                        System.out.println("OTHER***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                + distVal + "***********");
                    }
                }
            }
        }
        keywordsFreqName.remove(0);
        keywordsFreq.remove(0);
        count++;
    }
    return keywords.toArray(new String[keywords.size()]);
}
From source file:io.ssc.relationdiscovery.Runner.java
License:Open Source License
public static void main(String[] args) throws IOException {

    File labelsFile = new File("/home/ssc/Desktop/alan/R30 Tupel/feature_dict/part-r-00000");
    File occurrencesFile = new File("/home/ssc/Desktop/alan/R30 Tupel/tupleids/occurrences.tsv");

    // number of entity pairs in the data
    int numEntityPairs = 7853;
    // number of patterns in the data
    int numPatterns = 58702;
    // desired rank for dimension reduction
    int rank = 25;
    // distance measure for clustering
    DistanceMeasure distanceMeasure = new CosineDistanceMeasure();
    // number of clusters (k of k-Means)
    int numClusters = 10;
    // maximum number of iterations to run
    int maxIterations = 100;
    // number of points to print per cluster
    int numClosestPointsPerCluster = 20;

    long start = System.currentTimeMillis();

    OpenIntObjectHashMap<String> labels = Utils.loadLabels(labelsFile);
    Matrix A = Utils.loadOccurrences(occurrencesFile, numPatterns, numEntityPairs);

    SVD svd = new SVD(A, rank);
    svd.compute();
    Matrix P = svd.projectRowsOntoFeatureSpace();

    KMeans kMeans = new KMeans(P, numClusters, distanceMeasure);
    kMeans.run(maxIterations);

    for (int n = 0; n < numClusters; n++) {
        System.out.println("-----" + n + "------");
        kMeans.printClosestPoints(n, numClosestPointsPerCluster, labels);
        System.out.println("\n");
    }

    System.out.println("Computation took " + (System.currentTimeMillis() - start) + "ms");
}
From source file:net.aprendizajengrande.ontocluster.Clusterer.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }
    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));

    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
From source file:org.sleuthkit.hadoop.ClusterDocumentsJob.java
License:Open Source License
/**
 * Runs the clustering algorithms on the tfidf vectors that have been placed in
 * sequence files in directory 'input', and puts raw cluster/json data in
 * 'output'. Also puts json reporting data in the reports/data folder.
 * @param input The sequence files to cluster on.
 * @param output The output directory for raw canopy/kmeans cluster data.
 * @param dictionary The dictionary vector which maps the indices of the vectors
 * to words.
 * @param t1 The t1 value for canopy clustering. The distance measure for
 * canopy is CosineDistanceMeasure, so this should be a value between 0 and 1.
 * @param t2 The t2 value for canopy clustering. Again, should be between
 * t1 and 1. A smaller distance between the two results in more clusters;
 * a greater distance results in fewer.
 * @param imageID The hash of the image.
 * @param friendlyName The friendly, user given name of the image.
 * @param baseDir The base directory where output data for this image
 * is stored. Used to place the reporting data in the correct location.
 * @return A status code; will be non-zero if the task failed.
 */
public static int runPipeline(String input, String output, String dictionary, double t1, double t2,
        String imageID, String friendlyName, String baseDir) {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Path canopyInputPath = new Path(input);
    Path canopyOutputPath = new Path(output + "/canopy");

    Path kmeansInputPath = new Path(input);
    Path kmeansOutputPath = new Path(output + "/kmeans");
    // Canopy (I'm quite certain) only does one pass, so the relevant
    // clusters should be found in this file. For KMeans, this may not
    // be the case. Note, though, that the final clusters with document
    // vectors will be in a different file.
    Path kmeansClusters = new Path(output + "/canopy/clusters-0");

    try {
        CanopyDriver.run(conf, canopyInputPath, canopyOutputPath, new CosineDistanceMeasure(), t1, t2, true,
                false);
    } catch (Exception e) {
        LOG.error("Failure running mahout canopy.", e);
        return 1;
    }

    // The convergencedelta and maxiterations affect how long kmeans will
    // take to run and how many times we run the algorithm before we give
    // up. The numbers we are using here seem to give reasonably good
    // results.
    try {
        KMeansDriver.run(conf, kmeansInputPath, kmeansClusters, kmeansOutputPath, new CosineDistanceMeasure(),
                .5, 20, true, false);
    } catch (Exception e) {
        LOG.error("Failure running mahout kmeans.", e);
        return 2;
    }

    try {
        ////////////////////////////////
        // Output top cluster matches //
        ////////////////////////////////
        Job job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_MATCH);
        job.setJarByClass(TopFeatureMapper.class);

        // Get the final kmeans iteration. This is sort of a pain but for
        // whatever reason hadoop has no mechanism to do this for us.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        int i = 2;
        Path goodPath = new Path(output + "/kmeans/clusters-1");
        while (true) {
            Path testPath = new Path(output + "/kmeans/clusters-" + i);
            if (!fs.exists(testPath)) {
                break;
            }
            i++;
            goodPath = testPath;
        }

        FileInputFormat.setInputPaths(job, goodPath);
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/topClusters/"));

        job.setMapperClass(TopFeatureMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // We need to reduce serially.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.getConfiguration().set("org.sleuthkit.hadoop.dictionary", dictionary);

        job.waitForCompletion(true);

        ////////////////////////////////
        // Output Cluster->DocID JSON //
        ////////////////////////////////
        job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_JSON);
        job.setJarByClass(JSONClusterNameMapper.class);

        FileInputFormat.setInputPaths(job, new Path(output + "/kmeans/clusteredPoints/"));
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/jsonClusteredPoints/"));

        job.setMapperClass(JSONClusterNameMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // again, we need to reduce serially. We are crafting a single json object and so we must
        // have exactly one output file.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);

        // Note that, since we limit the number of reduce tasks to 1, there should only be
        // one reduce 'part'.
        ClusterJSONBuilder.buildReport(new Path(output + "/kmeans/topClusters/part-r-00000"),
                new Path(output + "/kmeans/jsonClusteredPoints/part-r-00000"),
                new Path(baseDir + "/reports/data/documents.js"));
        return 0;
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
    }
    // we have failed; return non-zero error code.
    return 3;
}
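For orientation, a hedged sketch of how runPipeline might be invoked; every path, threshold and identifier below is an illustrative placeholder rather than a value taken from the project:

// All arguments are hypothetical; t1/t2 follow the javadoc's guidance that both lie in (0, 1).
int status = ClusterDocumentsJob.runPipeline(
        "/data/img/vectors",      // tfidf vector sequence files
        "/data/img/clusters",     // output directory for raw canopy/kmeans data
        "/data/img/dictionary",   // dictionary vector mapping indices to words
        0.65,                     // t1
        0.9,                      // t2
        "0123456789abcdef",       // image hash
        "example-image",          // friendly image name
        "/data/img");             // base directory for report output
if (status != 0) {
    System.err.println("Clustering pipeline failed with status " + status);
}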