List of usage examples for org.apache.spark.api.java.function.MapFunction
From source file:com.andado.spark.examples.sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession.builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
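Note: MapFunction is a functional interface, so on Java 8 or later the same Row-to-String transformation can be written as a lambda. A minimal sketch of the equivalent call (the explicit cast keeps the overloaded map method unambiguous between its Scala and Java variants):

  // Lambda form of the anonymous MapFunction above
  Dataset<String> stringsDS = sqlDF.map(
      (MapFunction<Row, String>) row -> "Key: " + row.get(0) + ", Value: " + row.get(1),
      Encoders.STRING());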
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = "examples/src/main/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
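The increment over the primitive Dataset above can likewise be expressed as a lambda. A sketch assuming Java 8 or later, with the cast resolving the map overload:

  // Lambda form of the Integer MapFunction above
  Dataset<Integer> transformedDS = primitiveDS.map(
      (MapFunction<Integer, Integer>) value -> value + 1,
      integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]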
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runInferSchemaExample(SparkSession spark) {
  // $example on:schema_inferring$
  // Create an RDD of Person objects from a text file
  JavaRDD<Person> peopleRDD = spark.read()
      .textFile("examples/src/main/resources/people.txt")
      .javaRDD()
      .map(new Function<String, Person>() {
        @Override
        public Person call(String line) throws Exception {
          String[] parts = line.split(",");
          Person person = new Person();
          person.setName(parts[0]);
          person.setAge(Integer.parseInt(parts[1].trim()));
          return person;
        }
      });

  // Apply a schema to an RDD of JavaBeans to get a DataFrame
  Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
  // Register the DataFrame as a temporary view
  peopleDF.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

  // The columns of a row in the result can be accessed by field index
  Encoder<String> stringEncoder = Encoders.STRING();
  Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, stringEncoder);
  teenagerNamesByIndexDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+

  // or by field name
  Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.<String>getAs("name");
    }
  }, stringEncoder);
  teenagerNamesByFieldDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:schema_inferring$
}
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
      .textFile("examples/src/main/resources/people.txt", 1)
      .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
From source file:com.andado.spark.examples.sql.JavaSQLDataSourceExample.java
License:Apache License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json("examples/src/main/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved.
  // The result of loading a parquet file is also a DataFrame.
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
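Because the Parquet file written above preserves the schema, it can also be read back as a typed Dataset. A sketch assuming the Person bean used by the other examples on this page (not part of the original source):

  // Read the Parquet file as a typed Dataset via a bean encoder
  // (assumes a Person bean with name/age fields, as in the other examples)
  Dataset<Person> typedPeopleDS = spark.read().parquet("people.parquet").as(Encoders.bean(Person.class));
  typedPeopleDS.show();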
From source file:dbx.compute.spark.jobs.sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "file:" + System.getProperty("user.dir") + "spark-warehouse";
  SparkSession spark = SparkSession.builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a HiveContext.
  List<Record> records = new ArrayList<Record>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
From source file:gtl.spark.java.example.apache.sql.streaming.JavaStructuredSessionization.java
License:Apache License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaStructuredSessionization <hostname> <port>");
    System.exit(1);
  }

  String host = args[0];
  int port = Integer.parseInt(args[1]);

  SparkSession spark = SparkSession.builder().appName("JavaStructuredSessionization").getOrCreate();

  // Create DataFrame representing the stream of input lines from connection to host:port
  Dataset<Row> lines = spark.readStream()
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load();

  FlatMapFunction<LineWithTimestamp, Event> linesToEvents =
      new FlatMapFunction<LineWithTimestamp, Event>() {
        @Override
        public Iterator<Event> call(LineWithTimestamp lineWithTimestamp) throws Exception {
          ArrayList<Event> eventList = new ArrayList<Event>();
          for (String word : lineWithTimestamp.getLine().split(" ")) {
            eventList.add(new Event(word, lineWithTimestamp.getTimestamp()));
          }
          return eventList.iterator();
        }
      };

  // Split the lines into words, treat words as sessionId of events
  Dataset<Event> events = lines
      .withColumnRenamed("value", "line")
      .as(Encoders.bean(LineWithTimestamp.class))
      .flatMap(linesToEvents, Encoders.bean(Event.class));

  // Sessionize the events. Track the number of events and the start and end timestamps of each
  // session, and report session updates.
  //
  // Step 1: Define the state update function
  MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate> stateUpdateFunc =
      new MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate>() {
        @Override
        public SessionUpdate call(String sessionId, Iterator<Event> events,
            GroupState<SessionInfo> state) throws Exception {
          // If timed out, then remove the session and send a final update
          if (state.hasTimedOut()) {
            SessionUpdate finalUpdate = new SessionUpdate(sessionId,
                state.get().calculateDuration(), state.get().getNumEvents(), true);
            state.remove();
            return finalUpdate;
          } else {
            // Find max and min timestamps in events
            long maxTimestampMs = Long.MIN_VALUE;
            long minTimestampMs = Long.MAX_VALUE;
            int numNewEvents = 0;
            while (events.hasNext()) {
              Event e = events.next();
              long timestampMs = e.getTimestamp().getTime();
              maxTimestampMs = Math.max(timestampMs, maxTimestampMs);
              minTimestampMs = Math.min(timestampMs, minTimestampMs);
              numNewEvents += 1;
            }
            SessionInfo updatedSession = new SessionInfo();

            // Update start and end timestamps in session
            if (state.exists()) {
              SessionInfo oldSession = state.get();
              updatedSession.setNumEvents(oldSession.numEvents + numNewEvents);
              updatedSession.setStartTimestampMs(oldSession.startTimestampMs);
              updatedSession.setEndTimestampMs(Math.max(oldSession.endTimestampMs, maxTimestampMs));
            } else {
              updatedSession.setNumEvents(numNewEvents);
              updatedSession.setStartTimestampMs(minTimestampMs);
              updatedSession.setEndTimestampMs(maxTimestampMs);
            }
            state.update(updatedSession);
            // Set timeout such that the session will expire if no data is received for 10 seconds
            state.setTimeoutDuration("10 seconds");
            return new SessionUpdate(sessionId, state.get().calculateDuration(),
                state.get().getNumEvents(), false);
          }
        }
      };

  // Step 2: Apply the state update function to the events streaming Dataset grouped by sessionId
  Dataset<SessionUpdate> sessionUpdates = events
      .groupByKey(new MapFunction<Event, String>() {
        @Override
        public String call(Event event) throws Exception {
          return event.getSessionId();
        }
      }, Encoders.STRING())
      .mapGroupsWithState(stateUpdateFunc, Encoders.bean(SessionInfo.class),
          Encoders.bean(SessionUpdate.class), GroupStateTimeout.ProcessingTimeTimeout());

  // Start running the query that prints the session updates to the console
  StreamingQuery query = sessionUpdates.writeStream().outputMode("update").format("console").start();
  query.awaitTermination();
}
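The groupByKey key extractor above is another MapFunction and can be written as a lambda on Java 8 or later. A sketch of the equivalent grouping step:

  // Lambda form of the key extractor passed to groupByKey
  Dataset<SessionUpdate> sessionUpdates = events
      .groupByKey((MapFunction<Event, String>) event -> event.getSessionId(), Encoders.STRING())
      .mapGroupsWithState(stateUpdateFunc, Encoders.bean(SessionInfo.class),
          Encoders.bean(SessionUpdate.class), GroupStateTimeout.ProcessingTimeTimeout());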
From source file:io.hops.examples.spark.kafka.StructuredStreamingKafka.java
License:Apache License
public static void main(String[] args) throws StreamingQueryException, InterruptedException {
  final String type = args[0];

  // Producer
  if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("producer")) {
    Set<String> topicsSet = new HashSet<>(Hops.getTopics());
    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    final List<HopsProducer> sparkProducers = new ArrayList<>();
    final DateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss:SSS");
    final List<String> messages = new ArrayList();
    final List<String> priorities = new ArrayList();
    final List<String> loggers = new ArrayList();

    /**
     * ********************************* Setup dummy test data ***********************************
     */
    messages.add("Container container_e01_1494850115055_0016_01_000002 succeeded");
    messages.add("Container container_e01_1494850115251_0015_01_000002 succeeded");
    messages.add("rollingMonitorInterval is set as -1. The log rolling mornitoring interval is disabled. "
        + "The logs will be aggregated after this application is finished.");
    messages.add("rollingMonitorInterval is set as -1. The log rolling mornitoring interval is disabled. "
        + "The logs will be aggregated after this application is finished.");
    messages.add("Sending out 2 container statuses: "
        + "[ContainerStatus: [ContainerId: container_e01_1494850115055_0016_01_000001, State: RUNNING, "
        + "Diagnostics: , ExitStatus: -1000, ], "
        + "ContainerStatus: [ContainerId: container_e01_1494850115055_0016_01_000002, "
        + "State: RUNNING, Diagnostics: , ExitStatus: -1000, ]]");
    messages.add("Node's health-status : true");
    messages.add("Cannot create writer for app application_1494433225517_0008. Skip log upload this time.");
    priorities.add("INFO");
    priorities.add("INFO");
    priorities.add("WARN");
    priorities.add("DEBUG");
    priorities.add("DEBUG");
    priorities.add("DEBUG");
    priorities.add("ERROR");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
    loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl");
    // End setup dummy data

    // Get a broker for the producer
    LOG.log(Level.INFO, "Producing to:{0}", Hops.getBrokerEndpointsList().get(0));
    Properties props = new Properties();
    props.put("bootstrap.servers", Hops.getBrokerEndpointsList().get(0));

    for (final String topic : topicsSet) {
      new Thread() {
        @Override
        public void run() {
          try {
            SparkProducer sparkProducer = Hops.getSparkProducer(topic, props);
            sparkProducers.add(sparkProducer);
            Map<String, String> message = new HashMap<>();
            int i = 0;
            // Produce Kafka messages to topic
            while (true) {
              message.put("message", messages.get(i % messages.size()));
              message.put("priority", priorities.get(i % priorities.size()));
              message.put("logger", loggers.get(i % loggers.size()));
              Date date = new Date();
              message.put("timestamp", sdf.format(date));
              sparkProducer.produce(message);
              Thread.sleep(100);
              i++;
            }
          } catch (SchemaNotFoundException | CredentialsNotFoundException | InterruptedException ex) {
            LOG.log(Level.SEVERE, ex.getMessage(), ex);
          }
        }
      }.start();
    }

    // Keep application running
    Hops.shutdownGracefully(jsc);
    for (HopsProducer hopsProducer : sparkProducers) {
      hopsProducer.close();
    }

  // Consumer
  } else {
    // Create a Dataset representing the stream of input lines from Kafka
    DataStreamReader dsr = Hops.getSparkConsumer().getKafkaDataStreamReader();
    Dataset<Row> lines = dsr.load();

    // Deserialize each Kafka record's Avro-encoded value and map it to a LogEntry bean
    Dataset<LogEntry> logEntries = lines.map(new MapFunction<Row, LogEntry>() {
      @Override
      public LogEntry call(Row record) throws Exception {
        GenericRecord genericRecord = RECORD_INJECTIONS.entrySet().iterator().next().getValue()
            .invert(record.getAs("value")).get();
        LogEntry logEntry = new LogEntry(genericRecord.get("timestamp").toString(),
            genericRecord.get("priority").toString(), genericRecord.get("logger").toString(),
            genericRecord.get("message").toString());
        return logEntry;
      }
    }, Encoders.bean(LogEntry.class));

    // Also keep the raw deserialized record as a string
    Dataset<String> logEntriesRaw = lines.map(new MapFunction<Row, String>() {
      @Override
      public String call(Row record) throws Exception {
        GenericRecord genericRecord = RECORD_INJECTIONS.entrySet().iterator().next().getValue()
            .invert(record.getAs("value")).get();
        return genericRecord.toString();
      }
    }, Encoders.STRING());

    // Start the queries that write the parsed and raw log entries to Parquet and text files
    StreamingQuery queryFile = logEntries.writeStream().format("parquet")
        .option("path", "/Projects/" + Hops.getProjectName() + "/Resources/data-parquet-" + Hops.getAppId())
        .option("checkpointLocation",
            "/Projects/" + Hops.getProjectName() + "/Resources/checkpoint-parquet-" + Hops.getAppId())
        .trigger(Trigger.ProcessingTime(10000)).start();

    StreamingQuery queryFile2 = logEntriesRaw.writeStream().format("text")
        .option("path", "/Projects/" + Hops.getProjectName() + "/Resources/data-text-" + Hops.getAppId())
        .option("checkpointLocation",
            "/Projects/" + Hops.getProjectName() + "/Resources/checkpoint-text-" + Hops.getAppId())
        .trigger(Trigger.ProcessingTime(10000)).start();

    Hops.shutdownGracefully(queryFile);
  }
}
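The two Row mappings above share the same Avro deserialization step; on Java 8 or later the raw-string variant can be written as a lambda. A sketch of the same logic, not from the original source:

  // Lambda form of the raw-string MapFunction above
  Dataset<String> logEntriesRaw = lines.map(
      (MapFunction<Row, String>) record -> RECORD_INJECTIONS.entrySet().iterator().next().getValue()
          .invert(record.getAs("value")).get().toString(),
      Encoders.STRING());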
From source file:my.first.sql.hive.JavaSparkHiveExample.java
License:Apache License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "hdfs://192.168.1.26:9002/user/hive/warehouse"; // "spark-warehouse"
  SparkSession spark = SparkSession.builder()
      .master("local") // .master("spark://192.168.1.26:7077")
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();
  // spark.sparkContext().addJar("E:\\work\\workspace\\spark-study\\target\\spark-study-1.0-SNAPSHOT.jar");

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'src/main/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");
  sqlDF.show();

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
From source file:my.first.sql.JavaSparkSQLExample.java
License:Apache License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = jsonPath;
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}