List of usage examples for org.apache.spark.api.java.function.MapFunction
MapFunction
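MapFunction<T, U> is the Java-friendly interface behind Dataset.map: it declares a single method, U call(T value) throws Exception, and Dataset.map(MapFunction, Encoder) takes an explicit Encoder for the result type because Java's erased generics leave Spark no way to infer it. The examples below all implement the interface as an anonymous inner class; on Java 8+ the same call can be written as a lambda. A minimal sketch, where teenagersDF stands in for any Dataset<Row> whose first column is a string:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;

// The cast selects the MapFunction overload of map() over the one taking
// scala.Function1, which would otherwise be ambiguous for a bare lambda.
Dataset<String> names = teenagersDF.map(
        (MapFunction<Row, String>) row -> "Name: " + row.getString(0),
        Encoders.STRING());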
From source file:my.first.sql.JavaSparkSQLExample.java
License:Apache License
private static void runInferSchemaExample(SparkSession spark) {
    // $example on:schema_inferring$
    // Create an RDD of Person objects from a text file
    JavaRDD<Person> peopleRDD = spark.read().textFile(textPath).javaRDD()
            .map(new Function<String, Person>() {
                @Override
                public Person call(String line) throws Exception {
                    String[] parts = line.split(",");
                    Person person = new Person();
                    person.setName(parts[0]);
                    person.setAge(Integer.parseInt(parts[1].trim()));
                    return person;
                }
            });

    // Apply a schema to an RDD of JavaBeans to get a DataFrame
    Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

    // The columns of a row in the result can be accessed by field index
    Encoder<String> stringEncoder = Encoders.STRING();
    Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, stringEncoder);
    teenagerNamesByIndexDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+

    // or by field name
    Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.<String>getAs("name");
        }
    }, stringEncoder);
    teenagerNamesByFieldDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:schema_inferring$
}
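The temporary view plus SQL is only one way to obtain teenagersDF; the same selection can be written directly against peopleDF with the Column API. A sketch, assuming the bean-inferred schema above (textPath is expected to point at a comma-separated name,age file such as Spark's bundled people.txt):

import static org.apache.spark.sql.functions.col;

// Equivalent to the SQL query, without registering a view.
Dataset<Row> teenagers = peopleDF
        .filter(col("age").between(13, 19))
        .select("name");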
From source file:my.first.sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) {
    // $example on:programmatic_schema$
    // Create an RDD
    JavaRDD<String> peopleRDD = spark.sparkContext().textFile(textPath, 1).toJavaRDD();

    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<>();
    for (String fieldName : schemaString.split(" ")) {
        StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        fields.add(field);
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows
    JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String record) throws Exception {
            String[] attributes = record.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

    // Creates a temporary view using the DataFrame
    peopleDataFrame.createOrReplaceTempView("people");

    // SQL can be run over a temporary view created using DataFrames
    Dataset<Row> results = spark.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations
    // The columns of a row in the result can be accessed by field index or by field name
    Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
}
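When the query projects a single string column, as here, the MapFunction detour is optional: Dataset.as(Encoder) can reinterpret a one-column DataFrame directly. A sketch under that assumption (note it yields the bare names, without the "Name: " prefix the MapFunction adds):

// results has exactly one StringType column, so it maps straight to Dataset<String>.
Dataset<String> namesOnly = results.as(Encoders.STRING());
namesOnly.show();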
From source file:my.first.sql.JavaSQLDataSourceExample.java
License:Apache License
private static void runBasicParquetExample(SparkSession spark) {
    // $example on:basic_parquet_example$
    Dataset<Row> peopleDF = spark.read().json("src/main/resources/people.json");

    // DataFrames can be saved as Parquet files, maintaining the schema information
    peopleDF.write().parquet("people.parquet");

    // Read in the Parquet file created above.
    // Parquet files are self-describing so the schema is preserved
    // The result of loading a parquet file is also a DataFrame
    Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

    // Parquet files can also be used to create a temporary view and then used in SQL statements
    parquetFileDF.createOrReplaceTempView("parquetFile");
    Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
    Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:basic_parquet_example$
}
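One operational note: write().parquet(path) uses SaveMode.ErrorIfExists by default, so a second run of this method fails once people.parquet exists. A sketch of the rerunnable variant:

import org.apache.spark.sql.SaveMode;

// Overwrite any output left behind by an earlier run.
peopleDF.write().mode(SaveMode.Overwrite).parquet("people.parquet");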
From source file:org.workshop.sql.JavaSparkSQLExample.java
License:Apache License
private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = "src/main/resources/people.json";
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}
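The same map-with-encoder pattern works on typed Datasets, not only Dataset<Row>. A hedged sketch mapping the peopleDS above to strings, assuming Person follows the usual bean convention with a getName() accessor:

// Typed input: call() receives a Person directly, no Row indexing needed.
Dataset<String> nameDS = peopleDS.map(new MapFunction<Person, String>() {
    @Override
    public String call(Person p) throws Exception {
        return "Name: " + p.getName();
    }
}, Encoders.STRING());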
From source file:org.workshop.sql.JavaSparkSQLExample.java
License:Apache License
private static void runInferSchemaExample(SparkSession spark) {
    // $example on:schema_inferring$
    // Create an RDD of Person objects from a text file
    JavaRDD<Person> peopleRDD = spark.read().textFile("src/main/resources/people.txt").javaRDD()
            .map(new Function<String, Person>() {
                @Override
                public Person call(String line) throws Exception {
                    String[] parts = line.split(",");
                    Person person = new Person();
                    person.setName(parts[0]);
                    person.setAge(Integer.parseInt(parts[1].trim()));
                    return person;
                }
            });

    // Apply a schema to an RDD of JavaBeans to get a DataFrame
    Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

    // The columns of a row in the result can be accessed by field index
    Encoder<String> stringEncoder = Encoders.STRING();
    Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, stringEncoder);
    teenagerNamesByIndexDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+

    // or by field name
    Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.<String>getAs("name");
        }
    }, stringEncoder);
    teenagerNamesByFieldDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:schema_inferring$
}
From source file:org.workshop.sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) {
    // $example on:programmatic_schema$
    // Create an RDD
    JavaRDD<String> peopleRDD = spark.sparkContext().textFile("src/main/resources/people.txt", 1).toJavaRDD();

    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<>();
    for (String fieldName : schemaString.split(" ")) {
        StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        fields.add(field);
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows
    JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String record) throws Exception {
            String[] attributes = record.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

    // Creates a temporary view using the DataFrame
    peopleDataFrame.createOrReplaceTempView("people");

    // SQL can be run over a temporary view created using DataFrames
    Dataset<Row> results = spark.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations
    // The columns of a row in the result can be accessed by field index or by field name
    Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
}
From source file:sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
    // $example on:spark_hive$
    // warehouseLocation points to the default location for managed databases and tables
    String warehouseLocation = "spark-warehouse";
    SparkSession spark = SparkSession.builder().master("local[4]").appName("Java Spark Hive Example")
            .config("spark.sql.warehouse.dir", warehouseLocation).enableHiveSupport().getOrCreate();

    spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
    spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

    // Queries are expressed in HiveQL
    spark.sql("SELECT * FROM src").show();
    // +---+-------+
    // |key|  value|
    // +---+-------+
    // |238|val_238|
    // | 86| val_86|
    // |311|val_311|
    // ...

    // Aggregation queries are also supported.
    spark.sql("SELECT COUNT(*) FROM src").show();
    // +--------+
    // |count(1)|
    // +--------+
    // |     500|
    // +--------+

    // The results of SQL queries are themselves DataFrames and support all normal functions.
    Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

    // The items in DataFrames are of type Row, which lets you access each column by ordinal.
    Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Key: " + row.get(0) + ", Value: " + row.get(1);
        }
    }, Encoders.STRING());
    stringsDS.show();
    // +--------------------+
    // |               value|
    // +--------------------+
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // ...

    // You can also use DataFrames to create temporary views within a SparkSession.
    List<Record> records = new ArrayList<>();
    for (int key = 1; key < 100; key++) {
        Record record = new Record();
        record.setKey(key);
        record.setValue("val_" + key);
        records.add(record);
    }
    Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
    recordsDF.createOrReplaceTempView("records");

    // Queries can then join DataFrames data with data stored in Hive.
    spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
    // +---+------+---+------+
    // |key| value|key| value|
    // +---+------+---+------+
    // |  2| val_2|  2| val_2|
    // |  2| val_2|  2| val_2|
    // |  4| val_4|  4| val_4|
    // ...
    // $example off:spark_hive$

    spark.stop();
}
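Row.get(int) returns Object, so the concatenation above relies on toString(). The typed accessors do the same job and fail fast if the schema drifts; a sketch:

// getInt/getString throw ClassCastException on a schema mismatch
// instead of silently stringifying the wrong value.
Dataset<String> typedDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
        return "Key: " + row.getInt(0) + ", Value: " + row.getString(1);
    }
}, Encoders.STRING());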
From source file:sql.JavaSparkSQLExample.java
License:Apache License
private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]
    transformedDS.show();

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = "/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/people.json";
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}
From source file:sql.JavaSparkSQLExample.java
License:Apache License
private static void runInferSchemaExample(SparkSession spark) {
    // $example on:schema_inferring$
    // Create an RDD of Person objects from a text file
    JavaRDD<Person> peopleRDD = spark.read()
            .textFile("/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/people.txt")
            .javaRDD().map(new Function<String, Person>() {
                @Override
                public Person call(String line) throws Exception {
                    String[] parts = line.split(",");
                    Person person = new Person();
                    person.setName(parts[0]);
                    person.setAge(Integer.parseInt(parts[1].trim()));
                    return person;
                }
            });

    // Apply a schema to an RDD of JavaBeans to get a DataFrame
    Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

    // The columns of a row in the result can be accessed by field index
    Encoder<String> stringEncoder = Encoders.STRING();
    Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, stringEncoder);
    teenagerNamesByIndexDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+

    // or by field name
    Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.<String>getAs("name");
        }
    }, stringEncoder);
    teenagerNamesByFieldDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:schema_inferring$
}
From source file:sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) {
    // $example on:programmatic_schema$
    // Create an RDD
    JavaRDD<String> peopleRDD = spark.sparkContext()
            .textFile("/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/people.txt", 1)
            .toJavaRDD();

    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<>();
    for (String fieldName : schemaString.split(" ")) {
        StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        fields.add(field);
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows
    JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String record) throws Exception {
            String[] attributes = record.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

    // Creates a temporary view using the DataFrame
    peopleDataFrame.createOrReplaceTempView("people");

    // SQL can be run over a temporary view created using DataFrames
    Dataset<Row> results = spark.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations
    // The columns of a row in the result can be accessed by field index or by field name
    Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
}