Example usage for org.apache.spark.sql.api.java UDF1 UDF1

List of usage examples for org.apache.spark.sql.api.java UDF1 UDF1

Introduction

On this page you can find example usage for org.apache.spark.sql.api.java UDF1.

Prototype

UDF1

Source Link

Usage

From source file:com.andado.spark.examples.ml.JavaTokenizerExample.java

License:Apache License

/**
 * Demonstrates Spark ML tokenization: a whitespace-based {@code Tokenizer} and a
 * {@code RegexTokenizer} are applied to a small sentence DataFrame, and a
 * registered UDF ("countTokens") reports the number of tokens produced per row.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, "Hi I heard about Spark"),
            RowFactory.create(1, "I wish Java could use case classes"),
            RowFactory.create(2, "Logistic,regression,models,are,neat"));

    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) });

    Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);

    // Splits on whitespace only, so the comma-separated row stays one token.
    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");

    // Splits on any non-word character, so commas also separate tokens.
    RegexTokenizer regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words")
            .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);

    // The tokenizers emit array<string> columns, which Spark passes to a Java UDF
    // as a Scala WrappedArray<String>; parameterize the UDF instead of using the
    // raw WrappedArray type.
    spark.udf().register("countTokens", new UDF1<WrappedArray<String>, Integer>() {
        @Override
        public Integer call(WrappedArray<String> words) {
            return words.size();
        }
    }, DataTypes.IntegerType);

    Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
    tokenized.select("sentence", "words").withColumn("tokens", callUDF("countTokens", col("words")))
            .show(false);

    Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
    regexTokenized.select("sentence", "words").withColumn("tokens", callUDF("countTokens", col("words")))
            .show(false);
    // $example off$

    spark.stop();
}