Example usage for org.apache.spark.sql SaveMode Ignore

List of usage examples for org.apache.spark.sql SaveMode Ignore

Introduction

In this page you can find the example usage for org.apache.spark.sql SaveMode Ignore.

Prototype

SaveMode.Ignore

Follow the Source Link below to view the source code for org.apache.spark.sql SaveMode.Ignore.

Click Source Link

Document

Ignore mode means that when saving a DataFrame to a data source, if data already exists, the save operation is expected to not save the contents of the DataFrame and to not change the existing data.

Usage

From source file:com.hxr.bigdata.spark.example141.JavaSparkSQL.java

License:Apache License

/**
 * Demonstrates Spark SQL data sources: an RDD of JavaBeans, a programmatically
 * built schema, Parquet files (written with {@code SaveMode.Ignore}), and JSON
 * datasets loaded from a file and from an RDD of JSON strings.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if Spark initialization or any query fails
 */
public static void main(final String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(ctx);

    System.out.println("=== Data source: RDD ===");
    // Load a text file and convert each "name,age" line to a Person JavaBean.
    JavaRDD<Person> people = ctx.textFile("/spark/people.txt").map(new Function<String, Person>() {

        public Person call(final String line) {
            String[] parts = line.split(",");

            Person person = new Person();
            person.setName(parts[0]);
            person.setAge(Integer.parseInt(parts[1].trim()));

            return person;
        }
    });

    // Apply a schema to an RDD of JavaBeans and register it as a table.
    DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
    schemaPeople.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    // ---- Programmatic schema (no JavaBean) ----
    // The schema is encoded in a string.
    String schemaString = "name age";

    // Generate the schema based on the string of schema.
    List<StructField> fields = new ArrayList<StructField>();
    for (String fieldName : schemaString.split(" ")) {
        fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
    }
    StructType schema = DataTypes.createStructType(fields);

    // Load the text file again, this time without converting to a JavaBean.
    JavaRDD<String> peopleT = ctx.textFile("/spark/people.txt");

    // Convert records of the RDD to Rows.
    JavaRDD<Row> rowRDD = peopleT.map(new Function<String, Row>() {
        public Row call(final String record) throws Exception {
            String[] fields = record.split(",");
            return RowFactory.create(fields[0], fields[1].trim());
        }
    });

    // Apply the schema to the RDD.
    DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);

    // Register the DataFrame as a table.
    peopleDataFrame.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame results = sqlContext.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> names = results.javaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    // Print the results (previously collected but never used).
    for (String name : names) {
        System.out.println(name);
    }

    System.out.println("=== Data source: Parquet File ===");
    // DataFrames can be saved as parquet files, maintaining the schema information.
    //   SaveMode.ErrorIfExists (default): throw if output data already exists.
    //   SaveMode.Append:    append the DataFrame's contents to existing data.
    //   SaveMode.Overwrite: replace existing data with the DataFrame's contents.
    //   SaveMode.Ignore:    if data already exists, do nothing — neither save the
    //                       DataFrame nor change existing data (like CREATE TABLE IF NOT EXISTS).
    schemaPeople.write().mode(SaveMode.Ignore).parquet("people.parquet");

    // Read in the parquet file created above.
    // Parquet files are self-describing so the schema is preserved.
    // The result of loading a parquet file is also a DataFrame.
    DataFrame parquetFile = sqlContext.read().parquet("people.parquet");

    // Parquet files can also be registered as tables and then used in SQL statements.
    parquetFile.registerTempTable("parquetFile");
    DataFrame teenagers2 = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
    teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    System.out.println("=== Data source: JSON Dataset ===");
    // A JSON dataset is pointed to by path.
    // The path can be either a single text file or a directory storing text files.
    String path = "/spark/people.json";
    // Create a DataFrame from the file(s) pointed to by path.
    DataFrame peopleFromJsonFile = sqlContext.read().json(path);

    // Because the schema of a JSON dataset is automatically inferred, to write queries,
    // it is better to take a look at what is the schema.
    peopleFromJsonFile.printSchema();
    // The schema of people is ...
    // root
    // |-- age: IntegerType
    // |-- name: StringType

    // Register this DataFrame as a table.
    peopleFromJsonFile.registerTempTable("people");

    // SQL statements can be run by using the sql methods provided by sqlContext.
    DataFrame teenagers3 = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays
            .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
    DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());

    // Take a look at the schema of this new DataFrame.
    peopleFromJsonRDD.printSchema();
    // The schema of anotherPeople is ...
    // root
    // |-- address: StructType
    // | |-- city: StringType
    // | |-- state: StringType
    // |-- name: StringType

    peopleFromJsonRDD.registerTempTable("people2");

    // Nested fields are addressable with dot syntax (address.city).
    DataFrame peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
    List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0) + ", City: " + row.getString(1);
        }
    }).collect();
    for (String name : nameAndCity) {
        System.out.println(name);
    }

    ctx.stop();
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.SparkUtil.java

License:Apache License

/**
 * Returns the {@link SaveMode} with the specified (case-insensitive) name.
 *
 * <p>Accepted names are {@code overwrite}, {@code append}, {@code ignore},
 * {@code error}, {@code errorifexists}, and {@code default}; the last three all
 * map to {@code SaveMode.ErrorIfExists}.
 *
 * @param s the save mode name; must not be null
 * @return the matching save mode
 * @throws NullPointerException if {@code s} is null
 * @throws IllegalArgumentException if no save mode has the specified name
 */
@Nonnull
public static SaveMode toSaveMode(final String s) {
    Preconditions.checkNotNull(s, "Save mode cannot be null");
    // Locale.ROOT makes the case-fold locale-independent: under a Turkish default
    // locale, "IGNORE".toLowerCase() yields "\u0131gnore" (dotless i) and would
    // fail to match any case below.
    switch (s.toLowerCase(java.util.Locale.ROOT)) {
    case "overwrite":
        return SaveMode.Overwrite;

    case "append":
        return SaveMode.Append;

    case "ignore":
        return SaveMode.Ignore;

    case "error":
    case "errorifexists":
    case "default":
        return SaveMode.ErrorIfExists;

    default:
        log.debug("Unknown save mode: {}", s);
        throw new IllegalArgumentException("Unknown save mode: " + s
                + ". Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists'.");
    }
}

From source file:sql.JavaSQLDataSourceExample.java

License:Apache License

/**
 * Demonstrates the basic data source API: generic load/save with the default
 * (Parquet) format, manually specified formats with an explicit save mode,
 * and running SQL directly against a Parquet file.
 */
private static void runBasicDataSourceExample(SparkSession spark) {
    // $example on:generic_load_save_functions$
    // Parquet is the default format, so read().load() needs no format() call.
    Dataset<Row> users = spark.read()
            .load("/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/users.parquet");
    users.select("name", "favorite_color").write().mode("overwrite").save("namesAndFavColors.parquet");
    users.show();
    // $example off:generic_load_save_functions$
    // $example on:manual_load_options$
    // Choose the input/output formats explicitly; Ignore mode leaves any
    // pre-existing output untouched instead of overwriting or failing.
    Dataset<Row> people = spark.read().format("json")
            .load("/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/people.json");
    people.select("name", "age").write().mode(SaveMode.Ignore).format("parquet").save("namesAndAges.parquet");
    people.show();
    // $example off:manual_load_options$
    // $example on:direct_sql$
    // Query a Parquet file in place, without registering it as a table first.
    Dataset<Row> fromFile = spark.sql(
            "SELECT * FROM parquet.`/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/users.parquet`");
    fromFile.show();
    // $example off:direct_sql$
}