Example usage for org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable

Introduction

On this page you can find example usage for org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable.

Prototype

public SequenceFileDirValueIterable(Path path, PathType pathType, Configuration conf) 
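
A minimal usage sketch for this constructor: iterate the values of every sequence file under a path. The directory name, value type, and loop body below are illustrative assumptions, not taken from the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class IterateValuesExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical directory of sequence files whose values are VectorWritable.
        Path dir = new Path("/tmp/vectors");
        // PathType.LIST reads every file in the directory;
        // PathType.GLOB (used in the examples below) expands the path as a wildcard pattern first.
        for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(dir, PathType.LIST, conf)) {
            Vector v = vw.get();
            System.out.println(v.size());
        }
    }
}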

Usage

From source file: org.conan.mymahout.clustering.streaming.tools.ClusterQualitySummarizer.java

License: Apache License

public int run(String[] args) throws IOException {
    if (!parseArgs(args)) {
        return -1;
    }

    Configuration conf = new Configuration();
    try {
        //      Configuration.dumpConfiguration(conf, new OutputStreamWriter(System.out));

        fileOut = new PrintWriter(new FileOutputStream(outputFile));
        fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
                + "distance.q4,count,is.train\n");

        // Reading in the centroids (both pairs, if they exist).
        List<Centroid> centroids;
        List<Centroid> centroidsCompare = null;
        if (mahoutKMeansFormat) {
            SequenceFileDirValueIterable<ClusterWritable> clusterIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
        } else {
            SequenceFileDirValueIterable<CentroidWritable> centroidIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
        }

        if (centroidCompareFile != null) {
            if (mahoutKMeansFormatCompare) {
                SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists
                        .newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
            } else {
                SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists.newArrayList(
                        IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
            }
        }

        // Reading in the "training" set.
        SequenceFileDirValueIterable<VectorWritable> trainIterable = new SequenceFileDirValueIterable<VectorWritable>(
                new Path(trainFile), PathType.GLOB, conf);
        Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
        Iterable<Vector> datapoints = trainDatapoints;

        printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
                new SquaredEuclideanDistanceMeasure()), "train");

        // Also adding in the "test" set.
        if (testFile != null) {
            SequenceFileDirValueIterable<VectorWritable> testIterable = new SequenceFileDirValueIterable<VectorWritable>(
                    new Path(testFile), PathType.GLOB, conf);
            Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);

            printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
                    new SquaredEuclideanDistanceMeasure()), "test");

            datapoints = Iterables.concat(trainDatapoints, testDatapoints);
        }

        // At this point, all train/test CSVs have been written. We now compute quality metrics.
        List<OnlineSummarizer> summaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroids,
                distanceMeasure);
        List<OnlineSummarizer> compareSummaries = null;
        if (centroidsCompare != null) {
            compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare,
                    distanceMeasure);
        }
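        // Print Dunn and Davies-Bouldin indices for the primary centroids, plus the comparison set when one was supplied.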
        System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
        System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        Closeables.close(fileOut, false);
    }
    return 0;
}
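
Note that every iterable in this example is constructed with PathType.GLOB, so each path is expanded as a glob pattern and a single iterable streams the values of all matching sequence files, such as the part files a MapReduce job produces.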

From source file: tk.summerway.mahout9.tools.MyClusterDumper.java

License: Apache License

public void printClusters(String[] dictionary) throws Exception {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
        if ("text".equals(dictionaryFormat)) {
            dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
        } else if ("sequencefile".equals(dictionaryFormat)) {
            dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary);
        } else {
            throw new IllegalArgumentException("Invalid dictionary format");
        }
    }

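    // Write to stdout unless an output file was requested; an s3n:// destination goes through the Hadoop FileSystem API.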
    Writer writer;
    boolean shouldClose;
    if (this.outputFile == null) {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    } else {
        shouldClose = true;
        if (outputFile.getName().startsWith("s3n://")) {
            Path p = outputPath;
            FileSystem fs = FileSystem.get(p.toUri(), conf);
            writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8);
        } else {
            Files.createParentDirs(outputFile);
            writer = Files.newWriter(this.outputFile, Charsets.UTF_8);
        }
    }
    ClusterWriter clusterWriter = createClusterWriter(writer, dictionary);
    try {
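        // Stream every ClusterWritable value from the matching part-* files into the ClusterWriter.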
        long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(
                new Path(seqFileDir, "part-*"), PathType.GLOB, conf));

        writer.flush();
        if (runEvaluation) {
            MyClusterEvaluator ce = new MyClusterEvaluator(pointsDir.toString(), seqFileDir.toString(),
                    "~/cluster_evaluate_result.txt", measure, 1000L);
            ce.evaluateClusters(conf);
        }
        //            if (runEvaluation) {
        //                HadoopUtil.delete(conf, new Path("tmp/representative"));
        //                int numIters = 5;
        //                RepresentativePointsDriver.main(new String[] { "--input",
        //                        seqFileDir.toString(), "--output",
        //                        "tmp/representative", "--clusteredPoints",
        //                        pointsDir.toString(), "--distanceMeasure",
        //                        measure.getClass().getName(), "--maxIter",
        //                        String.valueOf(numIters) });
        //                conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY,
        //                        measure.getClass().getName());
        //                conf.set(RepresentativePointsDriver.STATE_IN_KEY,
        //                        "tmp/representative/representativePoints-" + numIters);
        //                ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir);
        //                writer.append("\n");
        //                writer.append("Inter-Cluster Density: ")
        //                        .append(String.valueOf(ce.interClusterDensity()))
        //                        .append("\n");
        //                writer.append("Intra-Cluster Density: ")
        //                        .append(String.valueOf(ce.intraClusterDensity()))
        //                        .append("\n");
        //                CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir);
        //                writer.append("CDbw Inter-Cluster Density: ")
        //                        .append(String.valueOf(cdbw.interClusterDensity()))
        //                        .append("\n");
        //                writer.append("CDbw Intra-Cluster Density: ")
        //                        .append(String.valueOf(cdbw.intraClusterDensity()))
        //                        .append("\n");
        //                writer.append("CDbw Separation: ")
        //                        .append(String.valueOf(cdbw.separation())).append("\n");
        //                writer.flush();
        //            }
        log.info("Wrote {} clusters", numWritten);
    } finally {
        if (shouldClose) {
            Closeables.close(clusterWriter, false);
        } else {
            if (clusterWriter instanceof GraphMLClusterWriter) {
                clusterWriter.close();
            }
        }
    }
}