List of usage examples for the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable constructor
public SequenceFileDirValueIterable(Path path, PathType pathType, Configuration conf)
From source file:org.conan.mymahout.clustering.streaming.tools.ClusterQualitySummarizer.java
License:Apache License
/**
 * Tool entry point: reads centroids (and an optional comparison set of centroids),
 * reads the "train" vectors (and optional "test" vectors), writes per-cluster
 * distance summaries as CSV to {@code outputFile}, and prints the Dunn and
 * Davies-Bouldin quality indexes to stdout.
 *
 * @param args command-line arguments, interpreted by {@code parseArgs}
 * @return 0 on success, -1 when argument parsing fails or an I/O error occurs
 * @throws IOException declared for interface compatibility; I/O failures inside
 *                     the body are caught, reported, and mapped to a -1 return
 */
public int run(String[] args) throws IOException {
    if (!parseArgs(args)) {
        return -1;
    }
    Configuration conf = new Configuration();
    try {
        fileOut = new PrintWriter(new FileOutputStream(outputFile));
        fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
                + "distance.q4,count,is.train\n");

        // Reading in the centroids (both pairs, if they exist).
        List<Centroid> centroids;
        List<Centroid> centroidsCompare = null;
        if (mahoutKMeansFormat) {
            SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
                    new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
        } else {
            SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
                    new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
        }

        if (centroidCompareFile != null) {
            if (mahoutKMeansFormatCompare) {
                SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
                        new SequenceFileDirValueIterable<ClusterWritable>(
                                new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists.newArrayList(
                        IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
            } else {
                SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
                        new SequenceFileDirValueIterable<CentroidWritable>(
                                new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists.newArrayList(
                        IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
            }
        }

        // Reading in the "training" set.
        SequenceFileDirValueIterable<VectorWritable> trainIterable =
                new SequenceFileDirValueIterable<VectorWritable>(new Path(trainFile), PathType.GLOB, conf);
        Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
        Iterable<Vector> datapoints = trainDatapoints;

        printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
                new SquaredEuclideanDistanceMeasure()), "train");

        // Also adding in the "test" set, when one was supplied; the quality
        // metrics below are then computed over train + test combined.
        if (testFile != null) {
            SequenceFileDirValueIterable<VectorWritable> testIterable =
                    new SequenceFileDirValueIterable<VectorWritable>(new Path(testFile), PathType.GLOB, conf);
            Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
            printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
                    new SquaredEuclideanDistanceMeasure()), "test");
            datapoints = Iterables.concat(trainDatapoints, testDatapoints);
        }

        // At this point, all train/test CSVs have been written. We now compute quality metrics.
        List<OnlineSummarizer> summaries =
                ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
        List<OnlineSummarizer> compareSummaries = null;
        if (centroidsCompare != null) {
            compareSummaries =
                    ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
        }

        System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
        System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
    } catch (IOException e) {
        // The previous version printed only e.getMessage() to stdout and then
        // fell through to "return 0", reporting success on failure. Report the
        // full failure on stderr and signal a non-zero result instead.
        e.printStackTrace(System.err);
        return -1;
    } finally {
        Closeables.close(fileOut, false);
    }
    return 0;
}
From source file:tk.summerway.mahout9.tools.MyClusterDumper.java
License:Apache License
public void printClusters(String[] dictionary) throws Exception { Configuration conf = new Configuration(); if (this.termDictionary != null) { if ("text".equals(dictionaryFormat)) { dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary)); } else if ("sequencefile".equals(dictionaryFormat)) { dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary); } else {/*from ww w .j av a 2s.c om*/ throw new IllegalArgumentException("Invalid dictionary format"); } } Writer writer; boolean shouldClose; if (this.outputFile == null) { shouldClose = false; writer = new OutputStreamWriter(System.out, Charsets.UTF_8); } else { shouldClose = true; if (outputFile.getName().startsWith("s3n://")) { Path p = outputPath; FileSystem fs = FileSystem.get(p.toUri(), conf); writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8); } else { Files.createParentDirs(outputFile); writer = Files.newWriter(this.outputFile, Charsets.UTF_8); } } ClusterWriter clusterWriter = createClusterWriter(writer, dictionary); try { long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>( new Path(seqFileDir, "part-*"), PathType.GLOB, conf)); writer.flush(); if (runEvaluation) { MyClusterEvaluator ce = new MyClusterEvaluator(pointsDir.toString(), seqFileDir.toString(), "~/cluster_evaluate_result.txt", measure, 1000L); ce.evaluateClusters(conf); } // if (runEvaluation) { // HadoopUtil.delete(conf, new Path("tmp/representative")); // int numIters = 5; // RepresentativePointsDriver.main(new String[] { "--input", // seqFileDir.toString(), "--output", // "tmp/representative", "--clusteredPoints", // pointsDir.toString(), "--distanceMeasure", // measure.getClass().getName(), "--maxIter", // String.valueOf(numIters) }); // conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, // measure.getClass().getName()); // conf.set(RepresentativePointsDriver.STATE_IN_KEY, // "tmp/representative/representativePoints-" + numIters); // ClusterEvaluator ce = new 
ClusterEvaluator(conf, seqFileDir); // writer.append("\n"); // writer.append("Inter-Cluster Density: ") // .append(String.valueOf(ce.interClusterDensity())) // .append("\n"); // writer.append("Intra-Cluster Density: ") // .append(String.valueOf(ce.intraClusterDensity())) // .append("\n"); // CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir); // writer.append("CDbw Inter-Cluster Density: ") // .append(String.valueOf(cdbw.interClusterDensity())) // .append("\n"); // writer.append("CDbw Intra-Cluster Density: ") // .append(String.valueOf(cdbw.intraClusterDensity())) // .append("\n"); // writer.append("CDbw Separation: ") // .append(String.valueOf(cdbw.separation())).append("\n"); // writer.flush(); // } log.info("Wrote {} clusters", numWritten); } finally { if (shouldClose) { Closeables.close(clusterWriter, false); } else { if (clusterWriter instanceof GraphMLClusterWriter) { clusterWriter.close(); } } } }