Example usage for the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable constructor

List of usage examples for the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable constructor

Introduction

On this page you can find example usages of the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable constructor.

Prototype

public SequenceFileDirValueIterable(Path path, PathType pathType, PathFilter filter, Configuration conf) 

Source Link

Usage

From source file:DisplayClustering.java

License:Apache License

/**
 * Collects the clusters wrapped by the {@link ClusterWritable} values found under the given path.
 *
 * <p>Values are read with a {@code SequenceFileDirValueIterable} using {@code PathType.LIST} and
 * {@code PathFilters.logsCRCFilter()}; every cluster is logged at info level as it is read.
 *
 * @param clustersIn path the cluster values are read from
 * @return the unwrapped clusters, in iteration order
 */
protected static List<Cluster> readClustersWritable(Path clustersIn) {
    Configuration conf = new Configuration();
    Iterable<ClusterWritable> writables = new SequenceFileDirValueIterable<ClusterWritable>(clustersIn,
            PathType.LIST, PathFilters.logsCRCFilter(), conf);

    List<Cluster> result = Lists.newArrayList();
    for (ClusterWritable writable : writables) {
        Cluster current = writable.getValue();
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", current.getId(),
                AbstractCluster.formatVector(current.getCenter(), null), current.getNumObservations(),
                AbstractCluster.formatVector(current.getRadius(), null));
        result.add(current);
    }
    return result;
}

From source file:com.modofo.molo.cluster.DisplayClustering.java

License:Apache License

/**
 * Collects the clusters wrapped by the {@link ClusterWritable} values found under the given path.
 *
 * <p>Values are read with a {@code SequenceFileDirValueIterable} using {@code PathType.LIST} and
 * {@code PathFilters.logsCRCFilter()}; every cluster is logged at info level as it is read.
 *
 * @param clustersIn path the cluster values are read from
 * @return the unwrapped clusters, in iteration order
 */
protected static List<Cluster> readClustersWritable(Path clustersIn) {
    Configuration conf = new Configuration();
    Iterable<ClusterWritable> writables = new SequenceFileDirValueIterable<ClusterWritable>(clustersIn,
            PathType.LIST, PathFilters.logsCRCFilter(), conf);

    List<Cluster> result = Lists.newArrayList();
    for (ClusterWritable writable : writables) {
        Cluster current = writable.getValue();
        // Arguments are packed explicitly, exactly as the logger call received them before.
        Object[] logArgs = { current.getId(), AbstractCluster.formatVector(current.getCenter(), null),
                current.getNumObservations(), AbstractCluster.formatVector(current.getRadius(), null) };
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", logArgs);
        result.add(current);
    }
    return result;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

/**
 * Iterates k-means in the current process until the clusters converge or {@code maxIterations}
 * passes have run, writing the clusters of every pass under {@code output}.
 *
 * <p>Each pass assigns every input vector to its nearest cluster, tests convergence, and writes
 * all clusters to {@code part-r-00000} inside a directory named
 * {@code AbstractCluster.CLUSTERS_DIR + iteration}. After the loop the directory of the last
 * pass is renamed by appending {@code Cluster.FINAL_ITERATION_SUFFIX}.
 *
 * @param conf          Hadoop configuration used for all file system access
 * @param input         path holding the input vectors
 * @param clustersIn    path the initial clusters are loaded from
 * @param output        directory that receives the per-iteration cluster directories
 * @param measure       distance measure handed to the {@code KMeansClusterer}
 * @param maxIterations upper bound on the number of passes
 * @param delta         convergence threshold, parsed with {@link Double#parseDouble(String)}
 * @return the path of the final clusters directory (the renamed last-iteration directory)
 * @throws IOException           if reading, writing or closing a cluster file fails
 * @throws IllegalStateException if no initial clusters could be loaded
 * @throws NumberFormatException if {@code delta} is not a parsable double
 */
private static Path buildClustersSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure, int maxIterations, String delta) throws IOException {

    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();

    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }

    // Loop-invariant: parse once, so a malformed delta fails before any output is written.
    double convergenceDelta = Double.parseDouble(delta);
    // All cluster files are created and renamed under 'output', so resolve the file system from
    // 'output' itself instead of from 'input' (writer) or the default file system (rename).
    FileSystem outputFs = FileSystem.get(output.toUri(), conf);

    boolean converged = false;
    int iteration = 1;
    while (!converged && iteration <= maxIterations) {
        log.info("K-Means Iteration: {}", iteration);
        for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
                PathFilters.logsCRCFilter(), conf)) {
            clusterer.addPointToNearestCluster(value.get(), clusters);
        }
        converged = clusterer.testConvergence(clusters, convergenceDelta);
        Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration);
        SequenceFile.Writer writer = new SequenceFile.Writer(outputFs, conf,
                new Path(clustersOut, "part-r-00000"), Text.class, Cluster.class);
        boolean threw = true;
        try {
            for (Cluster cluster : clusters) {
                if (log.isDebugEnabled()) {
                    log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] {
                            cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
                            cluster.getNumPoints(), AbstractCluster.formatVector(cluster.getRadius(), null),
                            clustersOut.getName() });
                }
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
            threw = false;
        } finally {
            // A failed close on a writer can mean lost data, so only swallow the close error
            // when another exception is already propagating from the try body.
            Closeables.close(writer, threw);
        }
        iteration++;
    }

    Path lastClustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1));
    Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)
            + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX);
    // rename() reports failure through its return value; surface it instead of dropping it.
    if (!outputFs.rename(lastClustersOut, finalClustersIn)) {
        log.warn("Could not rename {} to {}", lastClustersOut, finalClustersIn);
    }
    return finalClustersIn;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java

License:Apache License

/**
 * Fills {@code clusters} with the cluster information stored under {@code clusterPath}.
 *
 * <p>A value whose class is exactly {@code Cluster} is added as is; a value whose class is
 * exactly {@code Canopy} is converted into a new {@code Cluster} built from the canopy's
 * center, id and measure. Any other value class is rejected.
 *
 * @param conf        configuration used to read the stored values
 * @param clusterPath path the values are read from (filtered by {@code PathFilters.partFilter()})
 * @param clusters    collection that receives the clusters
 * @throws IllegalStateException if a stored value is neither a {@code Cluster} nor a {@code Canopy}
 */
public static void configureWithClusterInfo(Configuration conf, Path clusterPath,
        Collection<Cluster> clusters) {
    Iterable<Writable> storedValues = new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST,
            PathFilters.partFilter(), conf);
    for (Writable stored : storedValues) {
        Class<? extends Writable> storedType = stored.getClass();
        if (Cluster.class.equals(storedType)) {
            // Already a cluster: take it unchanged.
            clusters.add((Cluster) stored);
            continue;
        }
        if (Canopy.class.equals(storedType)) {
            // Canopies are turned into clusters with the same center, id and measure.
            Canopy seed = (Canopy) stored;
            clusters.add(new Cluster(seed.getCenter(), seed.getId(), seed.getMeasure()));
            continue;
        }
        throw new IllegalStateException("Bad value class: " + storedType);
    }
}

From source file:io.github.thushear.display.DisplayClustering.java

License:Apache License

/**
 * Reads the {@code Cluster} values found under the given path into a list.
 *
 * <p>Values are read with a {@code SequenceFileDirValueIterable} using {@code PathType.LIST} and
 * {@code PathFilters.logsCRCFilter()}; every cluster is logged at info level as it is read.
 *
 * @param clustersIn path the clusters are read from
 * @return the clusters, in iteration order
 */
protected static List<Cluster> readClusters(Path clustersIn) {
    Configuration conf = new Configuration();
    Iterable<Cluster> stored = new SequenceFileDirValueIterable<Cluster>(clustersIn, PathType.LIST,
            PathFilters.logsCRCFilter(), conf);

    List<Cluster> result = new ArrayList<Cluster>();
    for (Cluster current : stored) {
        // Arguments are packed explicitly, exactly as the logger call received them before.
        Object[] logArgs = { current.getId(), AbstractCluster.formatVector(current.getCenter(), null),
                current.getNumPoints(), AbstractCluster.formatVector(current.getRadius(), null) };
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", logArgs);
        result.add(current);
    }
    return result;
}

From source file:org.qcri.pca.PCATest.java

License:Apache License

/**
 * Runs the sequential and the MapReduce PPCA drivers from the same initial values and checks
 * that the resulting {@code ss}, the trace of {@code C}, and the returned errors agree within
 * {@code EPSILON}.
 */
@Test
public void crossTestIterationOfMapReducePPCASequentialPPCA() throws Exception {
    // Both runs start from the same random C (the MapReduce run gets its own copy) and ss.
    Matrix initialC = PCACommon.randomMatrix(D, d);
    double initialSs = PCACommon.randSS();
    InitialValues sequentialInit = new InitialValues(initialC, initialSs);
    InitialValues mapReduceInit = new InitialValues(initialC.clone(), initialSs);

    // Sequential run: load the input rows into an in-memory N x D matrix first.
    Matrix denseInput = new DenseMatrix(N, D);
    int nextRow = 0;
    for (VectorWritable rowVector : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
            null, conf)) {
        denseInput.assignRow(nextRow++, rowVector.get());
    }
    double sequentialError = ppcaDriver.runSequential(conf, denseInput, sequentialInit, 1);

    // MapReduce run over the same input, exposed as a distributed row matrix.
    DistributedRowMatrix distributedInput = new DistributedRowMatrix(input, tmp, N, D);
    distributedInput.setConf(conf);
    double mapReduceError = ppcaDriver.runMapReduce(conf, distributedInput, mapReduceInit, output, N, D, d,
            1, 1, 1, 1);

    Assert.assertEquals("ss value is different in sequential and mapreduce PCA", sequentialInit.ss,
            mapReduceInit.ss, EPSILON);

    double sequentialTrace = PCACommon.trace(sequentialInit.C);
    double mapReduceTrace = PCACommon.trace(mapReduceInit.C);
    Assert.assertEquals("C value is different in sequential and mapreduce PCA", sequentialTrace,
            mapReduceTrace, EPSILON);

    String errorMismatch = "The PPCA error between sequntial and mapreduce methods is too different: "
            + sequentialError + "!= " + mapReduceError;
    Assert.assertEquals(errorMismatch, sequentialError, mapReduceError, EPSILON);
}

From source file:org.qcri.pca.SPCADriver.java

/**
 * PPCA: sequential PPCA based on the paper from Tipping and Bishop.
 *
 * <p>Loads the input rows into an in-memory {@code nRows x nCols} matrix, draws random initial
 * values, and delegates to the matrix-based {@code runSequential} overload with a trailing
 * argument of 100.
 *
 * @param conf
 *          the configuration
 * @param input
 *          the path to the input matrix Y
 * @param output
 *          the output path (not used currently)
 * @param nRows
 *          number of rows in Y
 * @param nCols
 *          number of columns in Y
 * @param nPCs
 *          number of desired principal components
 * @return the error, or 0 when nothing is listed under {@code input}
 * @throws Exception
 */
double runSequential(Configuration conf, Path input, Path output, final int nRows, final int nCols,
        final int nPCs) throws Exception {
    Matrix centralY = new DenseMatrix(nRows, nCols);

    boolean inputIsEmpty = FileSystem.get(input.toUri(), conf).listStatus(input).length == 0;
    if (inputIsEmpty) {
        System.err.println("No file under " + input);
        return 0;
    }

    // Copy the stored vectors into consecutive rows of the dense matrix.
    int nextRow = 0;
    for (VectorWritable rowVector : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
            null, conf)) {
        centralY.assignRow(nextRow++, rowVector.get());
    }

    Matrix centralC = PCACommon.randomMatrix(nCols, nPCs);
    double ss = PCACommon.randSS();
    InitialValues initVal = new InitialValues(centralC, ss);
    return runSequential(conf, centralY, initVal, 100);
}

From source file:org.qcri.pca.SPCADriver.java

/**
 * PPCA: sequential PPCA based on the matlab implementation of Jacob Verbeek
 * /*from  w  w  w .  j a v  a2 s  .  c  om*/
 * @param conf
 *          the configuration
 * @param input
 *          the path to the input matrix Y
 * @param output
 *          the output path (not used currently)
 * @param nRows
 *          number or rows in Y
 * @param nCols
 *          number of columns in Y
 * @param nPCs
 *          number of desired principal components
 * @return the error
 * @throws Exception
 */
double runSequential_JacobVersion(Configuration conf, Path input, Path output, final int nRows, final int nCols,
        final int nPCs) throws Exception {
    Matrix centralY = new DenseMatrix(nRows, nCols);
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    if (fs.listStatus(input).length == 0) {
        System.err.println("No file under " + input);
        return 0;
    }
    int row = 0;
    for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, null,
            conf)) {
        centralY.assignRow(row, vw.get());
        row++;
    }
    Matrix C = PCACommon.randomMatrix(nCols, nPCs);
    double ss = PCACommon.randSS();
    InitialValues initVal = new InitialValues(C, ss);
    double error = runSequential_JacobVersion(conf, centralY, initVal, 100);
    return error;
}

From source file:sample.DisplayClustering.java

License:Apache License

/**
 * Collects the clusters wrapped by the {@link ClusterWritable} values found under the given path.
 *
 * <p>For every cluster one line is printed to standard output containing its id, formatted
 * center, number of observations and formatted radius, separated by single spaces.
 *
 * @param clustersIn path the cluster values are read from
 * @return the unwrapped clusters, in iteration order
 */
protected static List<Cluster> readClustersWritable(Path clustersIn) {
    Configuration conf = new Configuration();
    Iterable<ClusterWritable> writables = new SequenceFileDirValueIterable<ClusterWritable>(clustersIn,
            PathType.LIST, PathFilters.logsCRCFilter(), conf);

    List<Cluster> result = new ArrayList<>();
    for (ClusterWritable writable : writables) {
        Cluster current = writable.getValue();
        StringBuilder line = new StringBuilder("Cluster: ");
        line.append(current.getId()).append(" ");
        line.append(AbstractCluster.formatVector(current.getCenter(), null)).append(" ");
        line.append(current.getNumObservations()).append(" ");
        line.append(AbstractCluster.formatVector(current.getRadius(), null));
        System.out.println(line.toString());
        result.add(current);
    }
    return result;
}