Example usage for the org.apache.mahout.common.distance.CosineDistanceMeasure constructor

Introduction

On this page you can find example usage of the org.apache.mahout.common.distance.CosineDistanceMeasure constructor, collected from several open source projects.

Prototype

CosineDistanceMeasure()
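
All of the examples below call the no-argument constructor and then use the resulting object through the DistanceMeasure interface. As a quick orientation, here is a minimal, self-contained sketch (not taken from the sources below; the class name and vector values are made up for illustration) that constructs the measure and computes a distance. In Mahout, distance() returns 1 minus the cosine similarity of the two vectors.

import org.apache.mahout.common.distance.CosineDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class CosineDistanceExample {
    public static void main(String[] args) {
        // Two parallel vectors: cosine similarity is 1.0, so the distance should be ~0.0.
        Vector a = new DenseVector(new double[] { 1.0, 2.0, 3.0 });
        Vector b = new DenseVector(new double[] { 2.0, 4.0, 6.0 });

        CosineDistanceMeasure measure = new CosineDistanceMeasure();
        double distance = measure.distance(a, b);
        System.out.println("cosine distance = " + distance);
    }
}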

Usage

From source file: cc.recommenders.mining.calls.DistanceMeasureFactory.java

License: Open Source License

public DistanceMeasure get() {
    switch (options.getDistanceMeasure()) {
    case COSINE:
        return new CosineDistanceMeasure();
    case MANHATTAN:
        return new ManhattanDistanceMeasure();
    }
    throw new RuntimeException("unknown distance measure");
}

From source file: com.grantingersoll.intell.clustering.KMeansClusteringEngine.java

License: Apache License

public KMeansClusteringEngine() {
    execService = Executors.newSingleThreadExecutor();
    measure = new CosineDistanceMeasure();
}

From source file: com.gsvic.csmr.CosineSimilarityReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterable<VectorArrayWritable> value, Context context)
        throws IOException, InterruptedException {

    CosineDistanceMeasure cdm = new CosineDistanceMeasure();
    VectorWritable docX, docY;
    double cosine;
    for (VectorArrayWritable v : value) {
        docX = (VectorWritable) v.get()[0];
        docY = (VectorWritable) v.get()[1];
        cosine = cdm.distance(docX.get(), docY.get());
        context.write(key, new DoubleWritable(cosine));
    }

}

From source file: crawler.CETReCrawler.java

License: Apache License

private String[] distFilter() {
     DataSearch keywordsFilter = new DataSearch(TFFreq.keySet().toArray(new String[TFFreq.size()])); //pass potential keywords
     keywordsFilter.collectNewTweets();
     keywordsFilter.calculateTFIDF();

     ArrayList<String> keywordsFreqName = new ArrayList<String>();//
     ArrayList<Vector> keywordsFreq = new ArrayList<Vector>(); //list of keywords vector
     ArrayList<String> keywords = new ArrayList<String>(); //list of keywords

     for (String basehash : Settings.baseKeywords) {
         keywordsFreqName.add(basehash);//
         keywordsFreq.add(keywordsFilter.getSeedVect(basehash));
         keywords.add(basehash);
     }

     int count = 0;
     while (!keywordsFreq.isEmpty()) {
         String seq1Name = keywordsFreqName.get(0);//
         Vector seq1 = keywordsFreq.get(0);
         Vector seq2;
         Iterator<Entry<String, Integer>> iter = TFFreq.entrySet().iterator();
         while (iter.hasNext()) {
             Entry<String, Integer> ent = iter.next();
             String hashtag = ent.getKey();
             seq2 = keywordsFilter.getSeedVect(hashtag);

             //System.out.print("seq 1 ["+seq1Name+"] count ["+TFFreq.get(seq1Name)+"]: ");//
             //System.out.println(seq1.asFormatString());
             //System.out.print("seq 2 ["+hashtag+"] count ["+TFFreq.get(hashtag)+"]: ");
             if (seq1 != null && seq2 != null) {
                 //System.out.println(seq2.asFormatString());
                 CosineDistanceMeasure cos = new CosineDistanceMeasure();
                 double distVal = cos.distance(seq1, seq2);
                 //System.out.println("***********["+hashtag+"] v.s ["+seq1Name+"]: "+distVal+"***********");

                 //check whether one of the two words is baseline criteria
                 boolean inBase = false;
                 for (String basehash : Settings.baseKeywords) {
                     if (hashtag.replace("#", "").equals(basehash.toLowerCase()) || hashtag.equals(basehash)
                             || keywords.get(count).replace("#", "").equals(basehash.toLowerCase())
                             || keywords.get(count).equals(basehash)) {
                         inBase = true;
                         break;
                     }
                 }

                 //base 0.8, others 0.5
                 if (inBase) {
                     if (distVal < 0.8 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                         keywordsFreqName.add(hashtag);//
                         keywordsFreq.add(seq2);
                         keywords.add(hashtag);
                         System.out.println("BASEL***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                 + distVal + "***********");
                     }
                 } else {
                     if (distVal < 0.5 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                         keywordsFreqName.add(hashtag);//
                         keywordsFreq.add(seq2);
                         keywords.add(hashtag);
                         System.out.println("OTHER***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                 + distVal + "***********");
                     }
                 }
             }
         }
         keywordsFreqName.remove(0);//
         keywordsFreq.remove(0);
         count++;
     }

     return keywords.toArray(new String[keywords.size()]);
 }

From source file: io.ssc.relationdiscovery.Runner.java

License: Open Source License

public static void main(String[] args) throws IOException {

    File labelsFile = new File("/home/ssc/Desktop/alan/R30 Tupel/feature_dict/part-r-00000");
    File occurrencesFile = new File("/home/ssc/Desktop/alan/R30 Tupel/tupleids/occurrences.tsv");

    // number of entity pairs in the data
    int numEntityPairs = 7853;
    // number of patterns in the data
    int numPatterns = 58702;

    // desired rank for dimension reduction
    int rank = 25;

    // distance measure for clustering
    DistanceMeasure distanceMeasure = new CosineDistanceMeasure();

    // number of clusters (k of k-Means)
    int numClusters = 10;
    // maximum number of iterations to run
    int maxIterations = 100;
    // number of points to print per cluster
    int numClosestPointsPerCluster = 20;

    long start = System.currentTimeMillis();

    OpenIntObjectHashMap<String> labels = Utils.loadLabels(labelsFile);

    Matrix A = Utils.loadOccurrences(occurrencesFile, numPatterns, numEntityPairs);

    SVD svd = new SVD(A, rank);
    svd.compute();
    Matrix P = svd.projectRowsOntoFeatureSpace();

    KMeans kMeans = new KMeans(P, numClusters, distanceMeasure);

    kMeans.run(maxIterations);

    for (int n = 0; n < numClusters; n++) {
        System.out.println("-----" + n + "------");
        kMeans.printClosestPoints(n, numClosestPointsPerCluster, labels);
        System.out.println("\n");
    }

    System.out.println("Computation took " + (System.currentTimeMillis() - start) + "ms");
}

From source file: net.aprendizajengrande.ontocluster.Clusterer.java

License: Open Source License

public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // crear vectores en HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print

    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST,
            conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));

    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}

From source file: org.sleuthkit.hadoop.ClusterDocumentsJob.java

License: Open Source License

/**
 * Runs the clustering algorithms on the tfidf vectors that have been placed in
 * sequence files in directory 'input', and puts raw cluster/json data in
 * 'output'. Also puts json reporting data in the reports/data folder.
 * @param input The sequence files to cluster on.
 * @param output The output directory for raw canopy/kmeans cluster data.
 * @param dictionary The dictionary vector which maps the indices of the vectors
 * to words.
 * @param t1 The t1 value for canopy clustering. The distance measure for
 * canopy is CosineDistanceMeasure, so this should be a value between 0 and 1.
 * @param t2 The t2 value for canopy clustering. Again, should be between
 * t1 and 1. A smaller distance between the two results in more clusters;
 * a greater distance results in fewer.
 * @param imageID The hash of the image.
 * @param friendlyName The friendly, user given name of the image.
 * @param baseDir The base directory where output data for this image
 * is stored. Used to place the reporting data in the correct location.
 * @return A status code; will be non-zero if the task failed.
 */
public static int runPipeline(String input, String output, String dictionary, double t1, double t2,
        String imageID, String friendlyName, String baseDir) {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    Path canopyInputPath = new Path(input);
    Path canopyOutputPath = new Path(output + "/canopy");

    Path kmeansInputPath = new Path(input);
    Path kmeansOutputPath = new Path(output + "/kmeans");
    // Canopy (I'm quite certain) only does one pass, so the relevant
    // clusters should be found in this file. For KMeans, this may not
    // be the case. Note, though, that the final clusters with document
    // vectors will be in a different file.
    Path kmeansClusters = new Path(output + "/canopy/clusters-0");

    try {
        CanopyDriver.run(conf, canopyInputPath, canopyOutputPath, new CosineDistanceMeasure(), t1, t2, true,
                false);
    } catch (Exception e) {
        LOG.error("Failure running mahout canopy.", e);
        return 1;
    }

    // The convergencedelta and maxiterations affect how long kmeans will
    // take to run and how many times we run the algorithm before we give
    // up. The numbers we are using here seem to give reasonably good
    // results.
    try {
        KMeansDriver.run(conf, kmeansInputPath, kmeansClusters, kmeansOutputPath, new CosineDistanceMeasure(),
                .5, 20, true, false);
    } catch (Exception e) {
        LOG.error("Failure running mahout kmeans.", e);
        return 2;
    }

    try {
        ////////////////////////////////
        // Output top cluster matches //
        ////////////////////////////////
        Job job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_MATCH);
        job.setJarByClass(TopFeatureMapper.class);

        // Get the final kmeans iteration. This is sort of a pain but for
        // whatever reason hadoop has no mechanism to do this for us.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        int i = 2;
        Path goodPath = new Path(output + "/kmeans/clusters-1");

        while (true) {
            Path testPath = new Path(output + "/kmeans/clusters-" + i);
            if (!fs.exists(testPath)) {
                break;
            }
            i++;
            goodPath = testPath;
        }

        FileInputFormat.setInputPaths(job, goodPath);
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/topClusters/"));

        job.setMapperClass(TopFeatureMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // We need to reduce serially.
        job.setNumReduceTasks(1);

        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.getConfiguration().set("org.sleuthkit.hadoop.dictionary", dictionary);

        job.waitForCompletion(true);

        ////////////////////////////////
        // Output Cluster->DocID JSON //
        ////////////////////////////////

        job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_JSON);
        job.setJarByClass(JSONClusterNameMapper.class);

        FileInputFormat.setInputPaths(job, new Path(output + "/kmeans/clusteredPoints/"));
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/jsonClusteredPoints/"));

        job.setMapperClass(JSONClusterNameMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // again, we need to reduce serially. We are crafting a single json object and so we must
        // have exactly one output file.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);

        // Note that, since we limit the number of reduce tasks to 1, there should only be
        // one reduce 'part'.

        ClusterJSONBuilder.buildReport(new Path(output + "/kmeans/topClusters/part-r-00000"),
                new Path(output + "/kmeans/jsonClusteredPoints/part-r-00000"),
                new Path(baseDir + "/reports/data/documents.js"));
        return 0;
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
    }
    // we have failed; return non-zero error code.
    return 3;

}