Example usage for org.apache.spark.sql RowFactory create

Introduction

This page lists usage examples for org.apache.spark.sql.RowFactory.create.

Prototype

public static Row create(Object... values) 

Document

Create a Row from the given arguments.
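
As a quick orientation before the usage listings, here is a minimal, self-contained sketch (the "name"/"age" schema and values are illustrative, not taken from any of the sources below): it builds Rows with RowFactory.create, pairs them with a matching StructType, and reads the fields back positionally.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryCreateSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("RowFactoryCreateSketch").getOrCreate();

        // One Row per create(...) call; argument order must match the schema below.
        List<Row> data = Arrays.asList(RowFactory.create("Alice", 30), RowFactory.create("Bob", 25));

        StructType schema = new StructType(new StructField[] {
                new StructField("name", DataTypes.StringType, false, Metadata.empty()),
                new StructField("age", DataTypes.IntegerType, false, Metadata.empty()) });

        Dataset<Row> df = spark.createDataFrame(data, schema);
        df.show();

        // Fields are read back by position.
        Row first = df.first();
        System.out.println(first.getString(0) + " is " + first.getInt(1));

        spark.stop();
    }
}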

Usage

From source file:dbx.compute.spark.jobs.ml.JavaBucketizerExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();

    // $example on$
    double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY };

    List<Row> data = Arrays.asList(RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0),
            RowFactory.create(0.2));
    StructType schema = new StructType(
            new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures")
            .setSplits(splits);

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);
    bucketedData.show();
    // $example off$
    spark.stop();
}
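
Note: per the Spark documentation, n+1 split points define n buckets, each covering [x, y) except the last, which also includes its upper bound. With the splits above, -0.5 and -0.3 should land in bucket 1 and 0.0 and 0.2 in bucket 2.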

From source file:dbx.compute.spark.jobs.ml.JavaCountVectorizerExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(RowFactory.create(Arrays.asList("a", "b", "c")),
            RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")));
    StructType schema = new StructType(new StructField[] {
            new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    // fit a CountVectorizerModel from the corpus
    CountVectorizerModel cvModel = new CountVectorizer().setInputCol("text").setOutputCol("feature")
            .setVocabSize(3).setMinDF(2).fit(df);

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text")
            .setOutputCol("feature");

    cvModel.transform(df).show();
    // $example off$

    spark.stop();
}
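
Note: the a-priori model cvm is constructed but never applied in this listing; it exposes the same transform method, so cvm.transform(df).show() would vectorize the corpus against the fixed vocabulary {a, b, c}.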

From source file:dbx.compute.spark.jobs.ml.JavaDCTExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
            RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
            RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)));
    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });
    Dataset<Row> df = spark.createDataFrame(data, schema);
    DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false);
    Dataset<Row> dctDf = dct.transform(df);
    dctDf.select("featuresDCT").show(3);
    // $example off$
    spark.stop();
}
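
Note: setInverse(false) selects the forward DCT-II; per the Spark documentation the result is scaled so that the representing matrix is unitary, and no zero padding is applied to the input.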

From source file:dbx.compute.spark.jobs.ml.JavaPCAExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
            RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })),
            RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
            RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)));

    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df);

    Dataset<Row> result = pca.transform(df).select("pcaFeatures");
    result.show();
    // $example off$
    spark.stop();
}
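
Note: setK(3) projects each 5-dimensional feature vector onto the top three principal components, so the pcaFeatures column holds 3-element vectors.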

From source file:dbx.compute.spark.jobs.ml.JavaPolynomialExpansionExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate();

    // $example on$
    PolynomialExpansion polyExpansion = new PolynomialExpansion().setInputCol("features")
            .setOutputCol("polyFeatures").setDegree(3);

    List<Row> data = Arrays.asList(RowFactory.create(Vectors.dense(-2.0, 2.3)),
            RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)));

    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    Dataset<Row> polyDF = polyExpansion.transform(df);

    List<Row> rows = polyDF.select("polyFeatures").takeAsList(3);
    for (Row r : rows) {
        System.out.println(r.get(0));
    }
    // $example off$
    spark.stop();
}
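
Note: expanding two input features to degree 3 yields 9-element polyFeatures vectors; per the Spark documentation the expansion excludes the constant term.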

From source file:dbx.compute.spark.jobs.ml.JavaStopWordsRemoverExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();

    // $example on$
    StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered");

    List<Row> data = Arrays.asList(RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
            RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")));

    StructType schema = new StructType(new StructField[] {
            new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) });

    Dataset<Row> dataset = spark.createDataFrame(data, schema);
    remover.transform(dataset).show();
    // $example off$
    spark.stop();
}
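
Note: with the default English stop-word list, words such as "I", "the", "a", and "had" are removed, so the filtered column should read roughly [saw, red, balloon] and [Mary, little, lamb].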

From source file:dbx.compute.spark.jobs.ml.JavaVectorSlicerExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate();

    // $example on$
    Attribute[] attrs = new Attribute[] { NumericAttribute.defaultAttr().withName("f1"),
            NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);

    List<Row> data = Lists.newArrayList(
            RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })),
            RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)));

    Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));

    VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features");

    vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" });
    // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})

    Dataset<Row> output = vectorSlicer.transform(dataset);

    System.out.println(output.select("userFeatures", "features").first());
    // $example off$
    spark.stop();
}
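
Note: per the VectorSlicer documentation, the output orders the selected indices first, then the selected names, so the features column here holds two-element vectors: element 1 of userFeatures followed by the attribute named "f3".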

From source file:dbx.compute.spark.jobs.ml.JavaWord2VecExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
            RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
            RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))));
    StructType schema = new StructType(new StructField[] {
            new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
    Dataset<Row> documentDF = spark.createDataFrame(data, schema);

    // Learn a mapping from words to Vectors.
    Word2Vec word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(3)
            .setMinCount(0);
    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);
    for (Row r : result.select("result").takeAsList(3)) {
        System.out.println(r);
    }
    // $example off$

    spark.stop();
}
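
Note: Word2VecModel.transform maps each document to the average of its word vectors, so each of the three sentences above becomes a single 3-dimensional vector; setMinCount(0) keeps every token in the vocabulary regardless of frequency.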

From source file:fire.nodes.dataset.NodeDatasetFileOrDirectoryCSV.java

License:Apache License

@Override
public void execute(JavaSparkContext ctx, SQLContext sqlContext, WorkflowContext workflowContext,
        DataFrame df) {

    workflowContext.out("Executing NodeDatasetFileOrDirectoryCSV : " + id);

    // Load a text file and convert each line to a JavaBean.
    JavaRDD<String> people = ctx.textFile(path);

    // filter the header row
    if (filterLinesContaining != null) {
        people = people.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                // drop any line containing the filter string (e.g. the header)
                return !s.contains(filterLinesContaining);
            }
        });
    }

    // get schema
    final StructType schema = getSparkSQLSchema();

    // Convert records of the RDD (people) to Rows.
    JavaRDD<Row> rowRDD = people.flatMap(new FlatMapFunction<String, Row>() {

        @Override
        public Iterable<Row> call(String record) throws Exception {
            List<Row> ll = new LinkedList<Row>();

            String[] fields = record.split(separator);

            // skip invalid records
            if (fields.length != schema.length())
                return ll;

            Object[] f = new Object[fields.length];
            for (int idx = 0; idx < fields.length; idx++) {
                f[idx] = parseField(fields[idx], schema.fields()[idx]);
            }

            Row row = RowFactory.create(f);
            ll.add(row);

            return ll;
        }
    });

    // Apply the schema to the RDD.
    DataFrame tdf = sqlContext.createDataFrame(rowRDD, schema);

    super.execute(ctx, sqlContext, workflowContext, tdf);
}
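
A detail worth calling out in the listing above: because create takes Object..., passing a single Object[] (as in RowFactory.create(f)) spreads the array so that each element becomes one field of the Row. This is ordinary Java varargs behavior, not Spark-specific handling. A minimal sketch (the values are made up for illustration):

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

public class VarargsSketch {
    public static void main(String[] args) {
        Object[] f = new Object[] { "x", 1, 2.0 };

        // The array binds to the varargs parameter: one field per element.
        Row row = RowFactory.create(f);
        System.out.println(row.size()); // 3

        // To store the array itself as a single field, defeat the spread with a cast.
        Row oneField = RowFactory.create((Object) f);
        System.out.println(oneField.size()); // 1
    }
}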

From source file:gtl.spark.java.example.apache.ml.JavaCorrelationExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaCorrelationExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
            RowFactory.create(Vectors.sparse(4, new int[] { 0, 3 }, new double[] { 1.0, -2.0 })),
            RowFactory.create(Vectors.dense(4.0, 5.0, 0.0, 3.0)),
            RowFactory.create(Vectors.dense(6.0, 7.0, 0.0, 8.0)),
            RowFactory.create(Vectors.sparse(4, new int[] { 0, 3 }, new double[] { 9.0, 1.0 })));

    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    Row r1 = Correlation.corr(df, "features").head();
    System.out.println("Pearson correlation matrix:\n" + r1.get(0).toString());

    Row r2 = Correlation.corr(df, "features", "spearman").head();
    System.out.println("Spearman correlation matrix:\n" + r2.get(0).toString());
    // $example off$

    spark.stop();
}
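
Note: Correlation.corr returns a one-row DataFrame whose single cell holds the correlation Matrix, which is why both results are read back with head() and get(0); Pearson is the default method, with "spearman" requested explicitly in the second call.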