Example usage for org.apache.spark.sql RowFactory create

List of usage examples for org.apache.spark.sql RowFactory create

Introduction

On this page you can find example usage for org.apache.spark.sql RowFactory create.

Prototype

public static Row create(Object... values) 

Source Link

Document

Create a Row from the given arguments.

Usage

From source file:JavaWord2VecExample.java

License:Apache License

public static void main(String[] args) {

    // Classic (pre-2.0) Spark entry points: a SparkContext wrapped by a SQLContext.
    SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // $example on$
    // Input data: each row holds one sentence, split into a bag of words.
    JavaRDD<Row> sentenceRdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
            RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
            RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))));

    // Single non-nullable column "text" of string arrays.
    StructField textField = new StructField("text", new ArrayType(DataTypes.StringType, true), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { textField });
    DataFrame documentDF = sqlContext.createDataFrame(sentenceRdd, schema);

    // Learn a 3-dimensional word embedding; minCount 0 keeps every word.
    Word2Vec word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(3)
            .setMinCount(0);
    Word2VecModel model = word2Vec.fit(documentDF);
    DataFrame result = model.transform(documentDF);

    // Print the averaged sentence vectors for the first three rows.
    for (Row vectorRow : result.select("result").take(3)) {
        System.out.println(vectorRow);
    }
    // $example off$
}

From source file:com.andado.spark.examples.ml.JavaBucketizerExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();

    // $example on$
    // Bucket boundaries: (-inf, -0.5), [-0.5, 0), [0, 0.5), [0.5, +inf).
    double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY };

    // Sample points, one per row, covering every bucket.
    List<Row> points = Arrays.asList(
            RowFactory.create(-999.9),
            RowFactory.create(-0.5),
            RowFactory.create(-0.3),
            RowFactory.create(0.0),
            RowFactory.create(0.2),
            RowFactory.create(999.9));
    StructField featuresField = new StructField("features", DataTypes.DoubleType, false, Metadata.empty());
    StructType schema = new StructType(new StructField[] { featuresField });
    Dataset<Row> df = spark.createDataFrame(points, schema);

    Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures")
            .setSplits(splits);

    // Map every feature value onto the index of the bucket that contains it.
    Dataset<Row> bucketed = bucketizer.transform(df);

    int bucketCount = bucketizer.getSplits().length - 1;
    System.out.println("Bucketizer output with " + bucketCount + " buckets");
    bucketed.show();
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaCountVectorizerExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();

    // $example on$
    // Input data: each row is a bag of words from a sentence or document.
    List<Row> corpus = Arrays.asList(
            RowFactory.create(Arrays.asList("a", "b", "c")),
            RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")));
    StructField textField = new StructField("text", new ArrayType(DataTypes.StringType, true), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { textField });
    Dataset<Row> df = spark.createDataFrame(corpus, schema);

    // Fit a CountVectorizerModel from the corpus: keep at most 3 terms,
    // each of which must appear in at least 2 documents.
    CountVectorizerModel cvModel = new CountVectorizer().setInputCol("text").setOutputCol("feature")
            .setVocabSize(3).setMinDF(2).fit(df);

    // Alternatively, a CountVectorizerModel can be built from an a-priori vocabulary
    // (shown here for illustration; not used below).
    CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text")
            .setOutputCol("feature");

    cvModel.transform(df).show(false);
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaDCTExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate();

    // $example on$
    // Three dense 4-component vectors, one per row.
    List<Row> vectors = Arrays.asList(
            RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
            RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
            RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)));
    StructField featuresField = new StructField("features", new VectorUDT(), false, Metadata.empty());
    StructType schema = new StructType(new StructField[] { featuresField });
    Dataset<Row> df = spark.createDataFrame(vectors, schema);

    // Forward (non-inverse) discrete cosine transform of each vector.
    DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false);
    Dataset<Row> transformed = dct.transform(df);

    transformed.select("featuresDCT").show(false);
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaPCAExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate();

    // $example on$
    // Three 5-dimensional vectors; the first is sparse (non-zeros at indices 1 and 3).
    List<Row> vectors = Arrays.asList(
            RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })),
            RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
            RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)));

    StructField featuresField = new StructField("features", new VectorUDT(), false, Metadata.empty());
    StructType schema = new StructType(new StructField[] { featuresField });
    Dataset<Row> df = spark.createDataFrame(vectors, schema);

    // Fit PCA projecting the 5-dimensional features down to 3 principal components.
    PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df);

    Dataset<Row> projected = pca.transform(df).select("pcaFeatures");
    projected.show(false);
    // $example off$
    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaPolynomialExpansionExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate();

    // $example on$
    // Three 2-dimensional feature vectors, one per row.
    List<Row> vectors = Arrays.asList(
            RowFactory.create(Vectors.dense(2.0, 1.0)),
            RowFactory.create(Vectors.dense(0.0, 0.0)),
            RowFactory.create(Vectors.dense(3.0, -1.0)));
    StructField featuresField = new StructField("features", new VectorUDT(), false, Metadata.empty());
    StructType schema = new StructType(new StructField[] { featuresField });
    Dataset<Row> df = spark.createDataFrame(vectors, schema);

    // Expand each vector into the polynomial feature space of degree 3.
    PolynomialExpansion polyExpansion = new PolynomialExpansion().setInputCol("features")
            .setOutputCol("polyFeatures").setDegree(3);

    Dataset<Row> expanded = polyExpansion.transform(df);
    expanded.show(false);
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaStopWordsRemoverExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();

    // $example on$
    // Two sentences, already tokenized into word arrays.
    List<Row> sentences = Arrays.asList(
            RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
            RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")));

    StructField rawField = new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { rawField });
    Dataset<Row> dataset = spark.createDataFrame(sentences, schema);

    // Drop common English stop words from the "raw" column into "filtered".
    StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered");
    remover.transform(dataset).show(false);
    // $example off$
    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaVectorSlicerExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate();

    // $example on$
    // Attach named numeric attributes (f1, f2, f3) to the "userFeatures" vector column
    // so the slicer can select sub-vectors by name as well as by index.
    Attribute[] attrs = new Attribute[] { NumericAttribute.defaultAttr().withName("f1"),
            NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);

    // Use the JDK's Arrays.asList instead of Guava's Lists.newArrayList: it avoids a
    // third-party dependency and matches the sibling examples; createDataFrame only
    // reads the list, so the fixed-size view is sufficient.
    List<Row> data = java.util.Arrays.asList(
            RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })),
            RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)));

    Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));

    VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features");

    // Select feature 1 by index plus feature "f3" by name.
    vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" });
    // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})

    Dataset<Row> output = vectorSlicer.transform(dataset);
    output.show(false);
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaWord2VecExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate();

    // $example on$
    // Input data: each row holds one sentence, split into a bag of words.
    List<Row> sentences = Arrays.asList(
            RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
            RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
            RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))));
    StructField textField = new StructField("text", new ArrayType(DataTypes.StringType, true), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { textField });
    Dataset<Row> documentDF = spark.createDataFrame(sentences, schema);

    // Learn a 3-dimensional word embedding; minCount 0 keeps every word.
    Word2Vec word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(3)
            .setMinCount(0);

    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);

    // Print each sentence alongside its averaged word vector.
    for (Row r : result.collectAsList()) {
        List<String> text = r.getList(0);
        Vector vector = (Vector) r.get(1);
        System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
    }
    // $example off$

    spark.stop();
}

From source file:com.bosscs.spark.commons.utils.CellsUtils.java

License:Apache License

/**
 * Creates a SparkSQL Row object from a Stratio Cells object.
 *
 * @param cells Stratio Cells object for transforming.
 * @return SparkSQL Row created from Cells.
 */
public static Row getRowFromCells(Cells cells) {
    // Spreading the Object[] through the varargs factory makes each
    // cell value one field of the resulting Row.
    return RowFactory.create(cells.getCellValues().toArray());
}