List of usage examples for org.apache.spark.api.java.function.MapFunction
From source file:com.andado.spark.examples.sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession.builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
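Note: MapFunction is a functional interface, so on Java 8 or later the same Row-to-String transformation can be written as a lambda. A minimal sketch of the equivalent call (the explicit cast keeps the overloaded map method unambiguous between its Scala and Java variants):

  // Lambda form of the anonymous MapFunction above
  Dataset<String> stringsDS = sqlDF.map(
      (MapFunction<Row, String>) row -> "Key: " + row.get(0) + ", Value: " + row.get(1),
      Encoders.STRING());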
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = "examples/src/main/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
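The increment over the primitive Dataset above can likewise be expressed as a lambda. A sketch assuming Java 8 or later, with the cast resolving the map overload:

  // Lambda form of the Integer MapFunction above
  Dataset<Integer> transformedDS = primitiveDS.map(
      (MapFunction<Integer, Integer>) value -> value + 1,
      integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]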
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runInferSchemaExample(SparkSession spark) {
  // $example on:schema_inferring$
  // Create an RDD of Person objects from a text file
  JavaRDD<Person> peopleRDD = spark.read()
      .textFile("examples/src/main/resources/people.txt")
      .javaRDD()
      .map(new Function<String, Person>() {
        @Override
        public Person call(String line) throws Exception {
          String[] parts = line.split(",");
          Person person = new Person();
          person.setName(parts[0]);
          person.setAge(Integer.parseInt(parts[1].trim()));
          return person;
        }
      });

  // Apply a schema to an RDD of JavaBeans to get a DataFrame
  Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
  // Register the DataFrame as a temporary view
  peopleDF.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

  // The columns of a row in the result can be accessed by field index
  Encoder<String> stringEncoder = Encoders.STRING();
  Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, stringEncoder);
  teenagerNamesByIndexDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+

  // or by field name
  Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.<String>getAs("name");
    }
  }, stringEncoder);
  teenagerNamesByFieldDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:schema_inferring$
}
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
      .textFile("examples/src/main/resources/people.txt", 1)
      .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
From source file:com.andado.spark.examples.sql.JavaSQLDataSourceExample.java
License:Apache License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json("examples/src/main/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved.
  // The result of loading a parquet file is also a DataFrame.
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
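Because the Parquet file written above preserves the schema, it can also be read back as a typed Dataset. A sketch assuming the Person bean used by the other examples on this page (not part of the original source):

  // Read the Parquet file as a typed Dataset via a bean encoder
  // (assumes a Person bean with name/age fields, as in the other examples)
  Dataset<Person> typedPeopleDS = spark.read().parquet("people.parquet").as(Encoders.bean(Person.class));
  typedPeopleDS.show();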
From source file:dbx.compute.spark.jobs.sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "file:" + System.getProperty("user.dir") + "spark-warehouse";
  SparkSession spark = SparkSession.builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a HiveContext.
  List<Record> records = new ArrayList<Record>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
From source file:gtl.spark.java.example.apache.sql.streaming.JavaStructuredSessionization.java
License:Apache License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaStructuredSessionization <hostname> <port>");
    System.exit(1);
  }

  String host = args[0];
  int port = Integer.parseInt(args[1]);

  SparkSession spark = SparkSession.builder().appName("JavaStructuredSessionization").getOrCreate();

  // Create DataFrame representing the stream of input lines from connection to host:port
  Dataset<Row> lines = spark.readStream()
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load();

  FlatMapFunction<LineWithTimestamp, Event> linesToEvents =
      new FlatMapFunction<LineWithTimestamp, Event>() {
        @Override
        public Iterator<Event> call(LineWithTimestamp lineWithTimestamp) throws Exception {
          ArrayList<Event> eventList = new ArrayList<Event>();
          for (String word : lineWithTimestamp.getLine().split(" ")) {
            eventList.add(new Event(word, lineWithTimestamp.getTimestamp()));
          }
          return eventList.iterator();
        }
      };

  // Split the lines into words, treat words as sessionId of events
  Dataset<Event> events = lines
      .withColumnRenamed("value", "line")
      .as(Encoders.bean(LineWithTimestamp.class))
      .flatMap(linesToEvents, Encoders.bean(Event.class));

  // Sessionize the events. Track the number of events and the start and end timestamps of each
  // session, and report session updates.
  //
  // Step 1: Define the state update function
  MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate> stateUpdateFunc =
      new MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate>() {
        @Override
        public SessionUpdate call(String sessionId, Iterator<Event> events,
            GroupState<SessionInfo> state) throws Exception {
          // If timed out, then remove the session and send a final update
          if (state.hasTimedOut()) {
            SessionUpdate finalUpdate = new SessionUpdate(sessionId,
                state.get().calculateDuration(), state.get().getNumEvents(), true);
            state.remove();
            return finalUpdate;
          } else {
            // Find max and min timestamps in events
            long maxTimestampMs = Long.MIN_VALUE;
            long minTimestampMs = Long.MAX_VALUE;
            int numNewEvents = 0;
            while (events.hasNext()) {
              Event e = events.next();
              long timestampMs = e.getTimestamp().getTime();
              maxTimestampMs = Math.max(timestampMs, maxTimestampMs);
              minTimestampMs = Math.min(timestampMs, minTimestampMs);
              numNewEvents += 1;
            }
            SessionInfo updatedSession = new SessionInfo();

            // Update start and end timestamps in session
            if (state.exists()) {
              SessionInfo oldSession = state.get();
              updatedSession.setNumEvents(oldSession.numEvents + numNewEvents);
              updatedSession.setStartTimestampMs(oldSession.startTimestampMs);
              updatedSession.setEndTimestampMs(Math.max(oldSession.endTimestampMs, maxTimestampMs));
            } else {
              updatedSession.setNumEvents(numNewEvents);
              updatedSession.setStartTimestampMs(minTimestampMs);
              updatedSession.setEndTimestampMs(maxTimestampMs);
            }
            state.update(updatedSession);
            // Set timeout such that the session will expire if no data is received for 10 seconds
            state.setTimeoutDuration("10 seconds");
            return new SessionUpdate(sessionId, state.get().calculateDuration(),
                state.get().getNumEvents(), false);
          }
        }
      };

  // Step 2: Apply the state update function to the events streaming Dataset grouped by sessionId
  Dataset<SessionUpdate> sessionUpdates = events
      .groupByKey(new MapFunction<Event, String>() {
        @Override
        public String call(Event event) throws Exception {
          return event.getSessionId();
        }
      }, Encoders.STRING())
      .mapGroupsWithState(stateUpdateFunc, Encoders.bean(SessionInfo.class),
          Encoders.bean(SessionUpdate.class), GroupStateTimeout.ProcessingTimeTimeout());

  // Start running the query that prints the session updates to the console
  StreamingQuery query = sessionUpdates.writeStream().outputMode("update").format("console").start();
  query.awaitTermination();
}
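The groupByKey key extractor above is another MapFunction and can be written as a lambda on Java 8 or later. A sketch of the equivalent grouping step:

  // Lambda form of the key extractor passed to groupByKey
  Dataset<SessionUpdate> sessionUpdates = events
      .groupByKey((MapFunction<Event, String>) event -> event.getSessionId(), Encoders.STRING())
      .mapGroupsWithState(stateUpdateFunc, Encoders.bean(SessionInfo.class),
          Encoders.bean(SessionUpdate.class), GroupStateTimeout.ProcessingTimeTimeout());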
From source file:io.hops.examples.spark.kafka.StructuredStreamingKafka.java
License:Apache License
public static void main(String[] args) throws StreamingQueryException, InterruptedException {
  final String type = args[0];

  // Producer
  if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("producer")) {
    Set<String> topicsSet = new HashSet<>(Hops.getTopics());
    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    final List<HopsProducer> sparkProducers = new ArrayList<>();
    final DateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss:SSS");
    final List<String> messages = new ArrayList();
    final List<String> priorities = new ArrayList();
    final List<String> loggers = new ArrayList();

    /**
     * ********************************* Setup dummy test data ***********************************
     */
    messages.add("Container container_e01_1494850115055_0016_01_000002 succeeded");
    messages.add("Container container_e01_1494850115251_0015_01_000002 succeeded");
    messages.add("rollingMonitorInterval is set as -1. The log rolling mornitoring interval is disabled. "
        + "The logs will be aggregated after this application is finished.");
    messages.add("rollingMonitorInterval is set as -1. The log rolling mornitoring interval is disabled. "
        + "The logs will be aggregated after this application is finished.");
    messages.add("Sending out 2 container statuses: "
        + "[ContainerStatus: [ContainerId: container_e01_1494850115055_0016_01_000001, State: RUNNING, "
        + "Diagnostics: , ExitStatus: -1000, ], "
        + "ContainerStatus: [ContainerId: container_e01_1494850115055_0016_01_000002, "
        + "State: RUNNING, Diagnostics: , ExitStatus: -1000, ]]");
    messages.add("Node's health-status : true");
    messages.add("Cannot create writer for app application_1494433225517_0008. Skip log upload this time.");
    priorities.add("INFO");
    priorities.add("INFO");
    priorities.add("WARN");
    priorities.add("DEBUG");
    priorities.add("DEBUG");
    priorities.add("DEBUG");
    priorities.add("ERROR");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl");
    // End setup dummy data

    // Get a broker for the producer
    LOG.log(Level.INFO, "Producing to:{0}", Hops.getBrokerEndpointsList().get(0));
    Properties props = new Properties();
    props.put("bootstrap.servers", Hops.getBrokerEndpointsList().get(0));

    for (final String topic : topicsSet) {
      new Thread() {
        @Override
        public void run() {
          try {
            SparkProducer sparkProducer = Hops.getSparkProducer(topic, props);
            sparkProducers.add(sparkProducer);
            Map<String, String> message = new HashMap<>();
            int i = 0;
            // Produce Kafka messages to topic
            while (true) {
              message.put("message", messages.get(i % messages.size()));
              message.put("priority", priorities.get(i % priorities.size()));
              message.put("logger", loggers.get(i % loggers.size()));
              Date date = new Date();
              message.put("timestamp", sdf.format(date));
              sparkProducer.produce(message);
              Thread.sleep(100);
              i++;
            }
          } catch (SchemaNotFoundException | CredentialsNotFoundException | InterruptedException ex) {
            LOG.log(Level.SEVERE, ex.getMessage(), ex);
          }
        }
      }.start();
    }

    // Keep application running
    Hops.shutdownGracefully(jsc);
    for (HopsProducer hopsProducer : sparkProducers) {
      hopsProducer.close();
    }

  // Consumer
  } else {
    // Create a Dataset representing the stream of input lines from Kafka
    DataStreamReader dsr = Hops.getSparkConsumer().getKafkaDataStreamReader();
    Dataset<Row> lines = dsr.load();

    // Deserialize each Kafka record's Avro-encoded value and map it to a LogEntry bean
    Dataset<LogEntry> logEntries = lines.map(new MapFunction<Row, LogEntry>() {
      @Override
      public LogEntry call(Row record) throws Exception {
        GenericRecord genericRecord = RECORD_INJECTIONS.entrySet().iterator().next().getValue()
            .invert(record.getAs("value")).get();
        LogEntry logEntry = new LogEntry(genericRecord.get("timestamp").toString(),
            genericRecord.get("priority").toString(), genericRecord.get("logger").toString(),
            genericRecord.get("message").toString());
        return logEntry;
      }
    }, Encoders.bean(LogEntry.class));

    // Also keep the raw deserialized record as a string
    Dataset<String> logEntriesRaw = lines.map(new MapFunction<Row, String>() {
      @Override
      public String call(Row record) throws Exception {
        GenericRecord genericRecord = RECORD_INJECTIONS.entrySet().iterator().next().getValue()
            .invert(record.getAs("value")).get();
        return genericRecord.toString();
      }
    }, Encoders.STRING());

    // Start the queries that write the parsed and raw log entries to Parquet and text files
    StreamingQuery queryFile = logEntries.writeStream().format("parquet")
        .option("path", "/Projects/" + Hops.getProjectName() + "/Resources/data-parquet-" + Hops.getAppId())
        .option("checkpointLocation",
            "/Projects/" + Hops.getProjectName() + "/Resources/checkpoint-parquet-" + Hops.getAppId())
        .trigger(Trigger.ProcessingTime(10000)).start();

    StreamingQuery queryFile2 = logEntriesRaw.writeStream().format("text")
        .option("path", "/Projects/" + Hops.getProjectName() + "/Resources/data-text-" + Hops.getAppId())
        .option("checkpointLocation",
            "/Projects/" + Hops.getProjectName() + "/Resources/checkpoint-text-" + Hops.getAppId())
        .trigger(Trigger.ProcessingTime(10000)).start();

    Hops.shutdownGracefully(queryFile);
  }
}
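The two Row mappings above share the same Avro deserialization step; on Java 8 or later the raw-string variant can be written as a lambda. A sketch of the same logic, not from the original source:

  // Lambda form of the raw-string MapFunction above
  Dataset<String> logEntriesRaw = lines.map(
      (MapFunction<Row, String>) record -> RECORD_INJECTIONS.entrySet().iterator().next().getValue()
          .invert(record.getAs("value")).get().toString(),
      Encoders.STRING());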
From source file:my.first.sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "hdfs://192.168.1.26:9002/user/hive/warehouse"; // "spark-warehouse"
  SparkSession spark = SparkSession.builder()
      .master("local") // .master("spark://192.168.1.26:7077")
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();
  // spark.sparkContext().addJar("E:\\work\\workspace\\spark-study\\target\\spark-study-1.0-SNAPSHOT.jar");

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'src/main/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");
  sqlDF.show();

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
From source file:my.first.sql.JavaSparkSQLExample.java
License:Apache License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = jsonPath;
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}