Example usage for org.apache.spark.api.java.function.MapFunction

List of usage examples for org.apache.spark.api.java.function.MapFunction

Introduction

On this page you can find example usage for org.apache.spark.api.java.function.MapFunction.

Prototype

public interface MapFunction<T, U> extends Serializable {
    U call(T value) throws Exception;
}
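
Every listing below implements this single-method interface as an anonymous inner class and passes it to Dataset.map together with an Encoder for the result type. The following minimal sketch shows the same pattern in isolation; the local master, app name, and sample data are illustrative assumptions rather than part of the listings, and on Java 8+ the anonymous class can be replaced by a lambda cast to MapFunction so that the right overload of map is chosen.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class MapFunctionSketch {
    public static void main(String[] args) {
        // Local session used only for this illustration
        SparkSession spark = SparkSession.builder().master("local[*]").appName("MapFunction Sketch")
                .getOrCreate();

        Dataset<Integer> numbers = spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT());

        // Anonymous-class form, as used throughout the examples below
        Dataset<String> labeled = numbers.map(new MapFunction<Integer, String>() {
            @Override
            public String call(Integer value) throws Exception {
                return "n=" + value;
            }
        }, Encoders.STRING());
        labeled.show(); // prints a single "value" column containing n=1, n=2, n=3

        // Equivalent Java 8 lambda form; the cast selects the MapFunction overload of map
        numbers.map((MapFunction<Integer, String>) value -> "n=" + value, Encoders.STRING()).show();

        spark.stop();
    }
}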

Usage

From source file: com.andado.spark.examples.sql.hive.JavaSparkHiveExample.java

License: Apache License

public static void main(String[] args) {
    // $example on:spark_hive$
    // warehouseLocation points to the default location for managed databases and tables
    String warehouseLocation = "spark-warehouse";
    SparkSession spark = SparkSession.builder().appName("Java Spark Hive Example")
            .config("spark.sql.warehouse.dir", warehouseLocation).enableHiveSupport().getOrCreate();

    spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
    spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

    // Queries are expressed in HiveQL
    spark.sql("SELECT * FROM src").show();
    // +---+-------+
    // |key|  value|
    // +---+-------+
    // |238|val_238|
    // | 86| val_86|
    // |311|val_311|
    // ...

    // Aggregation queries are also supported.
    spark.sql("SELECT COUNT(*) FROM src").show();
    // +--------+
    // |count(1)|
    // +--------+
    // |    500 |
    // +--------+

    // The results of SQL queries are themselves DataFrames and support all normal functions.
    Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

    // The items in DataFrames are of type Row, which lets you access each column by ordinal.
    Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Key: " + row.get(0) + ", Value: " + row.get(1);
        }
    }, Encoders.STRING());
    stringsDS.show();
    // +--------------------+
    // |               value|
    // +--------------------+
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // ...

    // You can also use DataFrames to create temporary views within a SparkSession.
    List<Record> records = new ArrayList<>();
    for (int key = 1; key < 100; key++) {
        Record record = new Record();
        record.setKey(key);
        record.setValue("val_" + key);
        records.add(record);
    }
    Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
    recordsDF.createOrReplaceTempView("records");

    // Queries can then join DataFrames data with data stored in Hive.
    spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
    // +---+------+---+------+
    // |key| value|key| value|
    // +---+------+---+------+
    // |  2| val_2|  2| val_2|
    // |  2| val_2|  2| val_2|
    // |  4| val_4|  4| val_4|
    // ...
    // $example off:spark_hive$

    spark.stop();
}

From source file: com.andado.spark.examples.sql.JavaSparkSQLExample.java

License: Apache License

private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = "examples/src/main/resources/people.json";
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}

From source file: com.andado.spark.examples.sql.JavaSparkSQLExample.java

License: Apache License

private static void runInferSchemaExample(SparkSession spark) {
    // $example on:schema_inferring$
    // Create an RDD of Person objects from a text file
    JavaRDD<Person> peopleRDD = spark.read().textFile("examples/src/main/resources/people.txt").javaRDD()
            .map(new Function<String, Person>() {
                @Override
                public Person call(String line) throws Exception {
                    String[] parts = line.split(",");
                    Person person = new Person();
                    person.setName(parts[0]);
                    person.setAge(Integer.parseInt(parts[1].trim()));
                    return person;
                }
            });

    // Apply a schema to an RDD of JavaBeans to get a DataFrame
    Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

    // The columns of a row in the result can be accessed by field index
    Encoder<String> stringEncoder = Encoders.STRING();
    Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, stringEncoder);
    teenagerNamesByIndexDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+

    // or by field name
    Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.<String>getAs("name");
        }
    }, stringEncoder);
    teenagerNamesByFieldDF.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:schema_inferring$
}

From source file: com.andado.spark.examples.sql.JavaSparkSQLExample.java

License: Apache License

private static void runProgrammaticSchemaExample(SparkSession spark) {
    // $example on:programmatic_schema$
    // Create an RDD
    JavaRDD<String> peopleRDD = spark.sparkContext().textFile("examples/src/main/resources/people.txt", 1)
            .toJavaRDD();

    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<>();
    for (String fieldName : schemaString.split(" ")) {
        StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        fields.add(field);
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows
    JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String record) throws Exception {
            String[] attributes = record.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

    // Creates a temporary view using the DataFrame
    peopleDataFrame.createOrReplaceTempView("people");

    // SQL can be run over a temporary view created using DataFrames
    Dataset<Row> results = spark.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations
    // The columns of a row in the result can be accessed by field index or by field name
    Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
}

From source file: com.andado.spark.examples.sql.JavaSQLDataSourceExample.java

License: Apache License

private static void runBasicParquetExample(SparkSession spark) {
    // $example on:basic_parquet_example$
    Dataset<Row> peopleDF = spark.read().json("examples/src/main/resources/people.json");

    // DataFrames can be saved as Parquet files, maintaining the schema information
    peopleDF.write().parquet("people.parquet");

    // Read in the Parquet file created above.
    // Parquet files are self-describing so the schema is preserved
    // The result of loading a parquet file is also a DataFrame
    Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

    // Parquet files can also be used to create a temporary view and then used in SQL statements
    parquetFileDF.createOrReplaceTempView("parquetFile");
    Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
    Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
        public String call(Row row) {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:basic_parquet_example$
}

From source file: dbx.compute.spark.jobs.sql.hive.JavaSparkHiveExample.java

License: Apache License

public static void main(String[] args) {
    // $example on:spark_hive$
    // warehouseLocation points to the default location for managed databases and tables
    String warehouseLocation = "file:" + System.getProperty("user.dir") + "spark-warehouse";
    SparkSession spark = SparkSession.builder().appName("Java Spark Hive Example")
            .config("spark.sql.warehouse.dir", warehouseLocation).enableHiveSupport().getOrCreate();

    spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
    spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src");

    // Queries are expressed in HiveQL
    spark.sql("SELECT * FROM src").show();
    // +---+-------+
    // |key|  value|
    // +---+-------+
    // |238|val_238|
    // | 86| val_86|
    // |311|val_311|
    // ...

    // Aggregation queries are also supported.
    spark.sql("SELECT COUNT(*) FROM src").show();
    // +--------+
    // |count(1)|
    // +--------+
    // |    500 |
    // +--------+

    // The results of SQL queries are themselves DataFrames and support all normal functions.
    Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

    // The items in DataFrames are of type Row, which lets you access each column by ordinal.
    Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
        public String call(Row row) throws Exception {
            return "Key: " + row.get(0) + ", Value: " + row.get(1);
        }
    }, Encoders.STRING());
    stringsDS.show();
    // +--------------------+
    // |               value|
    // +--------------------+
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // ...

    // You can also use DataFrames to create temporary views within a SparkSession.
    List<Record> records = new ArrayList<Record>();
    for (int key = 1; key < 100; key++) {
        Record record = new Record();
        record.setKey(key);
        record.setValue("val_" + key);
        records.add(record);
    }
    Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
    recordsDF.createOrReplaceTempView("records");

    // Queries can then join DataFrames data with data stored in Hive.
    spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
    // +---+------+---+------+
    // |key| value|key| value|
    // +---+------+---+------+
    // |  2| val_2|  2| val_2|
    // |  2| val_2|  2| val_2|
    // |  4| val_4|  4| val_4|
    // ...
    // $example off:spark_hive$

    spark.stop();
}

From source file: gtl.spark.java.example.apache.sql.streaming.JavaStructuredSessionization.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaStructuredSessionization <hostname> <port>");
        System.exit(1);
    }

    String host = args[0];
    int port = Integer.parseInt(args[1]);

    SparkSession spark = SparkSession.builder().appName("JavaStructuredSessionization").getOrCreate();

    // Create DataFrame representing the stream of input lines from connection to host:port
    Dataset<Row> lines = spark.readStream().format("socket").option("host", host).option("port", port)
            .option("includeTimestamp", true).load();

    FlatMapFunction<LineWithTimestamp, Event> linesToEvents = new FlatMapFunction<LineWithTimestamp, Event>() {
        @Override
        public Iterator<Event> call(LineWithTimestamp lineWithTimestamp) throws Exception {
            ArrayList<Event> eventList = new ArrayList<Event>();
            for (String word : lineWithTimestamp.getLine().split(" ")) {
                eventList.add(new Event(word, lineWithTimestamp.getTimestamp()));
            }
            return eventList.iterator();
        }
    };

    // Split the lines into words, treat words as sessionId of events
    Dataset<Event> events = lines.withColumnRenamed("value", "line").as(Encoders.bean(LineWithTimestamp.class))
            .flatMap(linesToEvents, Encoders.bean(Event.class));

    // Sessionize the events. Track the number of events and the start and end timestamps of the
    // session, and report session updates.
    //
    // Step 1: Define the state update function
    MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate> stateUpdateFunc = new MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate>() {
        @Override
        public SessionUpdate call(String sessionId, Iterator<Event> events, GroupState<SessionInfo> state)
                throws Exception {
            // If timed out, then remove session and send final update
            if (state.hasTimedOut()) {
                SessionUpdate finalUpdate = new SessionUpdate(sessionId, state.get().calculateDuration(),
                        state.get().getNumEvents(), true);
                state.remove();
                return finalUpdate;

            } else {
                // Find max and min timestamps in events
                long maxTimestampMs = Long.MIN_VALUE;
                long minTimestampMs = Long.MAX_VALUE;
                int numNewEvents = 0;
                while (events.hasNext()) {
                    Event e = events.next();
                    long timestampMs = e.getTimestamp().getTime();
                    maxTimestampMs = Math.max(timestampMs, maxTimestampMs);
                    minTimestampMs = Math.min(timestampMs, minTimestampMs);
                    numNewEvents += 1;
                }
                SessionInfo updatedSession = new SessionInfo();

                // Update start and end timestamps in session
                if (state.exists()) {
                    SessionInfo oldSession = state.get();
                    updatedSession.setNumEvents(oldSession.numEvents + numNewEvents);
                    updatedSession.setStartTimestampMs(oldSession.startTimestampMs);
                    updatedSession.setEndTimestampMs(Math.max(oldSession.endTimestampMs, maxTimestampMs));
                } else {
                    updatedSession.setNumEvents(numNewEvents);
                    updatedSession.setStartTimestampMs(minTimestampMs);
                    updatedSession.setEndTimestampMs(maxTimestampMs);
                }
                state.update(updatedSession);
                // Set timeout such that the session will expire if no data is received for 10 seconds
                state.setTimeoutDuration("10 seconds");
                return new SessionUpdate(sessionId, state.get().calculateDuration(), state.get().getNumEvents(),
                        false);
            }
        }
    };

    // Step 2: Apply the state update function to the events streaming Dataset grouped by sessionId
    Dataset<SessionUpdate> sessionUpdates = events.groupByKey(new MapFunction<Event, String>() {
        @Override
        public String call(Event event) throws Exception {
            return event.getSessionId();
        }
    }, Encoders.STRING()).mapGroupsWithState(stateUpdateFunc, Encoders.bean(SessionInfo.class),
            Encoders.bean(SessionUpdate.class), GroupStateTimeout.ProcessingTimeTimeout());

    // Start running the query that prints the session updates to the console
    StreamingQuery query = sessionUpdates.writeStream().outputMode("update").format("console").start();

    query.awaitTermination();
}

From source file: io.hops.examples.spark.kafka.StructuredStreamingKafka.java

License: Apache License

public static void main(String[] args) throws StreamingQueryException, InterruptedException {
    final String type = args[0];
    // Producer
    if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("producer")) {
        Set<String> topicsSet = new HashSet<>(Hops.getTopics());
        SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        final List<HopsProducer> sparkProducers = new ArrayList<>();
        final DateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss:SSS");
        final List<String> messages = new ArrayList<>();
        final List<String> priorities = new ArrayList<>();
        final List<String> loggers = new ArrayList<>();

        /**
         * ********************************* Setup dummy test data ***********************************
         */
        messages.add("Container container_e01_1494850115055_0016_01_000002 succeeded");
        messages.add("Container container_e01_1494850115251_0015_01_000002 succeeded");
        messages.add("rollingMonitorInterval is set as -1. The log rolling mornitoring interval is disabled. "
                + "The logs will be aggregated after this application is finished.");
        messages.add("rollingMonitorInterval is set as -1. The log rolling mornitoring interval is disabled. "
                + "The logs will be aggregated after this application is finished.");
        messages.add("Sending out 2 container statuses: "
                + "[ContainerStatus: [ContainerId: container_e01_1494850115055_0016_01_000001, State: RUNNING, "
                + "Diagnostics: , ExitStatus: -1000, ], "
                + "ContainerStatus: [ContainerId: container_e01_1494850115055_0016_01_000002, "
                + "State: RUNNING, Diagnostics: , ExitStatus: -1000, ]]");
        messages.add("Node's health-status : true");
        messages.add("Cannot create writer for app application_1494433225517_0008. Skip log upload this time.");
        priorities.add("INFO");
        priorities.add("INFO");
        priorities.add("WARN");
        priorities.add("DEBUG");
        priorities.add("DEBUG");
        priorities.add("DEBUG");
        priorities.add("ERROR");
        loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl");
        loggers.add("org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl");
        loggers.add(
                "org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl");
        loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
        loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
        loggers.add("org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl");
        loggers.add(
                "org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AppLogAggregatorImpl");
        //End setup dummy data

        //Get a broker for the producer
        LOG.log(Level.INFO, "Producing to:{0}", Hops.getBrokerEndpointsList().get(0));
        Properties props = new Properties();
        props.put("bootstrap.servers", Hops.getBrokerEndpointsList().get(0));
        for (final String topic : topicsSet) {
            new Thread() {
                @Override
                public void run() {
                    try {
                        SparkProducer sparkProducer = Hops.getSparkProducer(topic, props);
                        sparkProducers.add(sparkProducer);
                        Map<String, String> message = new HashMap<>();
                        int i = 0;
                        //Produce Kafka messages to topic
                        while (true) {
                            message.put("message", messages.get(i % messages.size()));
                            message.put("priority", priorities.get(i % priorities.size()));
                            message.put("logger", loggers.get(i % loggers.size()));
                            Date date = new Date();
                            message.put("timestamp", sdf.format(date));
                            sparkProducer.produce(message);
                            Thread.sleep(100);
                            i++;
                        }
                    } catch (SchemaNotFoundException | CredentialsNotFoundException | InterruptedException ex) {
                        LOG.log(Level.SEVERE, ex.getMessage(), ex);
                    }
                }
            }.start();
        } //Keep application running
        Hops.shutdownGracefully(jsc);
        for (HopsProducer hopsProducer : sparkProducers) {
            hopsProducer.close();
        }
        //Consumer
    } else {
        // Create DataSet representing the stream of input lines from kafka
        DataStreamReader dsr = Hops.getSparkConsumer().getKafkaDataStreamReader();
        Dataset<Row> lines = dsr.load();

        // Generate running word count
        Dataset<LogEntry> logEntries = lines.map(new MapFunction<Row, LogEntry>() {
            @Override
            public LogEntry call(Row record) throws Exception {
                GenericRecord genericRecord = RECORD_INJECTIONS.entrySet().iterator().next().getValue()
                        .invert(record.getAs("value")).get();
                LogEntry logEntry = new LogEntry(genericRecord.get("timestamp").toString(),
                        genericRecord.get("priority").toString(), genericRecord.get("logger").toString(),
                        genericRecord.get("message").toString());
                return logEntry;
            }
        }, Encoders.bean(LogEntry.class));

        Dataset<String> logEntriesRaw = lines.map(new MapFunction<Row, String>() {
            @Override
            public String call(Row record) throws Exception {
                GenericRecord genericRecord = RECORD_INJECTIONS.entrySet().iterator().next().getValue()
                        .invert(record.getAs("value")).get();

                return genericRecord.toString();
            }
        }, Encoders.STRING());

        // Start running the query that prints the running counts to the console
        StreamingQuery queryFile = logEntries.writeStream().format("parquet")
                .option("path",
                        "/Projects/" + Hops.getProjectName() + "/Resources/data-parquet-" + Hops.getAppId())
                .option("checkpointLocation", "/Projects/" + Hops.getProjectName()
                        + "/Resources/checkpoint-parquet-" + Hops.getAppId())
                .trigger(Trigger.ProcessingTime(10000)).start();

        StreamingQuery queryFile2 = logEntriesRaw.writeStream().format("text")
                .option("path",
                        "/Projects/" + Hops.getProjectName() + "/Resources/data-text-" + Hops.getAppId())
                .option("checkpointLocation",
                        "/Projects/" + Hops.getProjectName() + "/Resources/checkpoint-text-" + Hops.getAppId())
                .trigger(Trigger.ProcessingTime(10000)).start();

        Hops.shutdownGracefully(queryFile);
    }
}

From source file: my.first.sql.hive.JavaSparkHiveExample.java

License: Apache License

public static void main(String[] args) {
    // $example on:spark_hive$
    // warehouseLocation points to the default location for managed databases and tables
    String warehouseLocation = "hdfs://192.168.1.26:9002/user/hive/warehouse";//"spark-warehouse";
    SparkSession spark = SparkSession.builder().master("local")//.master("spark://192.168.1.26:7077")//
            .appName("Java Spark Hive Example").config("spark.sql.warehouse.dir", warehouseLocation)
            .enableHiveSupport().getOrCreate();
    //    spark.sparkContext().addJar("E:\\work\\workspace\\spark-study\\target\\spark-study-1.0-SNAPSHOT.jar");//,E:\\work\\soft\\apache-maven-3.3.9\\resp\\org\\apache\\spark\\spark-hive_2.11\\2.0.1\\spark-hive_2.11-2.0.1.jar");

    spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
    spark.sql("LOAD DATA LOCAL INPATH 'src/main/resources/kv1.txt' INTO TABLE src");

    // Queries are expressed in HiveQL
    spark.sql("SELECT * FROM src").show();
    // +---+-------+
    // |key|  value|
    // +---+-------+
    // |238|val_238|
    // | 86| val_86|
    // |311|val_311|
    // ...

    // Aggregation queries are also supported.
    spark.sql("SELECT COUNT(*) FROM src").show();
    // +--------+
    // |count(1)|
    // +--------+
    // |    500 |
    // +--------+

    // The results of SQL queries are themselves DataFrames and support all normal functions.
    Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");
    sqlDF.show();

    // The items in DataFrames are of type Row, which lets you access each column by ordinal.
    Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Key: " + row.get(0) + ", Value: " + row.get(1);
        }
    }, Encoders.STRING());
    stringsDS.show();
    // +--------------------+
    // |               value|
    // +--------------------+
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // |Key: 0, Value: val_0|
    // ...

    // You can also use DataFrames to create temporary views within a SparkSession.
    List<Record> records = new ArrayList<>();
    for (int key = 1; key < 100; key++) {
        Record record = new Record();
        record.setKey(key);
        record.setValue("val_" + key);
        records.add(record);
    }
    Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
    recordsDF.createOrReplaceTempView("records");

    // Queries can then join DataFrames data with data stored in Hive.
    spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
    // +---+------+---+------+
    // |key| value|key| value|
    // +---+------+---+------+
    // |  2| val_2|  2| val_2|
    // |  2| val_2|  2| val_2|
    // |  4| val_4|  4| val_4|
    // ...
    // $example off:spark_hive$

    spark.stop();
}

From source file: my.first.sql.JavaSparkSQLExample.java

License: Apache License

private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = jsonPath;
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}