List of usage examples for org.apache.spark.sql.api.java.UDF1
UDF1
From source file:com.andado.spark.examples.ml.JavaTokenizerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, "Hi I heard about Spark"), RowFactory.create(1, "I wish Java could use case classes"), RowFactory.create(2, "Logistic,regression,models,are,neat")); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); RegexTokenizer regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words") .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() { @Override/*from w ww.j a va2 s . c o m*/ public Integer call(WrappedArray words) { return words.size(); } }, DataTypes.IntegerType); Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame); tokenized.select("sentence", "words").withColumn("tokens", callUDF("countTokens", col("words"))) .show(false); Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame); regexTokenized.select("sentence", "words").withColumn("tokens", callUDF("countTokens", col("words"))) .show(false); // $example off$ spark.stop(); }