List of usage examples for org.apache.spark.sql SaveMode Ignore
SaveMode Ignore
To view the source code for org.apache.spark.sql SaveMode Ignore, click the Source Link.
From source file:com.hxr.bigdata.spark.example141.JavaSparkSQL.java
License:Apache License
public static void main(final String[] args) throws Exception { SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); SQLContext sqlContext = new SQLContext(ctx); System.out.println("=== Data source: RDD ==="); // Load a text file and convert each line to a Java Bean. // ?javabean? // hdfs://127.0.0.1:9000/spark/people.txt JavaRDD<Person> people = ctx.textFile("/spark/people.txt").map(new Function<String, Person>() { public Person call(final String line) { String[] parts = line.split(","); Person person = new Person(); person.setName(parts[0]);//from ww w . j a v a2 s .c om person.setAge(Integer.parseInt(parts[1].trim())); return person; } }); // Apply a schema to an RDD of Java Beans and register it as a table. // schema?javabeanRDD DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. // ??sql DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. // DataFrame?RDD?RDD? 
List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); for (String name : teenagerNames) { System.out.println(name); } // ------------------------??javabean-------------------------- // The schema is encoded in a string String schemaString = "name age"; // Generate the schema based on the string of schema List<StructField> fields = new ArrayList<StructField>(); for (String fieldName : schemaString.split(" ")) { fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true)); } StructType schema = DataTypes.createStructType(fields); // Load a text file and convert each line to a JavaBean. JavaRDD<String> peopleT = ctx.textFile("/spark/people.txt"); // Convert records of the RDD (people) to Rows. JavaRDD<Row> rowRDD = peopleT.map(new Function<String, Row>() { public Row call(final String record) throws Exception { String[] fields = record.split(","); return RowFactory.create(fields[0], fields[1].trim()); } }); // Apply the schema to the RDD. DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema); // Register the DataFrame as a table. peopleDataFrame.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. DataFrame results = sqlContext.sql("SELECT name FROM people"); // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. List<String> names = results.javaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); System.out.println("=== Data source: Parquet File ==="); // DataFrames can be saved as parquet files, maintaining the schema information. 
// hdfs??hdfs://127.0.0.1:9000/user/hanxirui/people.parquet // SaveMode.ErrorIfExists (default) When saving a DataFrame to a data source, if data already exists, an exception is expected to be thrown. // SaveMode.Append When saving a DataFrame to a data source, if data/table already exists, contents of the DataFrame are expected to be appended to existing data. // SaveMode.Overwrite Overwrite mode means that when saving a DataFrame to a data source, if data/table already exists, existing data is expected to be overwritten by the contents of the DataFrame. // SaveMode.Ignore Ignore mode means that when saving a DataFrame to a data source, if data already exists, the save operation is expected to not save the contents of the DataFrame and to not change the existing data. This is similar to a CREATE TABLE IF NOT EXISTS in SQL. schemaPeople.write().mode(SaveMode.Ignore).parquet("people.parquet"); // Read in the parquet file created above. // Parquet files are self-describing so the schema is preserved. // The result of loading a parquet file is also a DataFrame. DataFrame parquetFile = sqlContext.read().parquet("people.parquet"); // Parquet files can also be registered as tables and then used in SQL statements. parquetFile.registerTempTable("parquetFile"); DataFrame teenagers2 = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); for (String name : teenagerNames) { System.out.println(name); } System.out.println("=== Data source: JSON Dataset ==="); // A JSON dataset is pointed by path. // The path can be either a single text file or a directory storing text files. 
String path = "/spark/people.json"; // Create a DataFrame from the file(s) pointed by path DataFrame peopleFromJsonFile = sqlContext.read().json(path); // Because the schema of a JSON dataset is automatically inferred, to write queries, // it is better to take a look at what is the schema. peopleFromJsonFile.printSchema(); // The schema of people is ... // root // |-- age: IntegerType // |-- name: StringType // Register this DataFrame as a table. peopleFromJsonFile.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlContext. DataFrame teenagers3 = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrame and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); for (String name : teenagerNames) { System.out.println(name); } // Alternatively, a DataFrame can be created for a JSON dataset represented by // a RDD[String] storing one JSON object per string. List<String> jsonData = Arrays .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData); DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd()); // Take a look at the schema of this new DataFrame. peopleFromJsonRDD.printSchema(); // The schema of anotherPeople is ... 
// root // |-- address: StructType // | |-- city: StringType // | |-- state: StringType // |-- name: StringType peopleFromJsonRDD.registerTempTable("people2"); DataFrame peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2"); List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0) + ", City: " + row.getString(1); } }).collect(); for (String name : nameAndCity) { System.out.println(name); } ctx.stop(); }
From source file:com.thinkbiganalytics.kylo.catalog.spark.SparkUtil.java
License:Apache License
/**
 * Returns the save mode with the specified name.
 *
 * <p>Matching is case-insensitive and independent of the JVM's default locale. The names
 * {@code "error"}, {@code "errorifexists"} and {@code "default"} all map to
 * {@code SaveMode.ErrorIfExists}.
 *
 * @param s the save mode name; must not be null
 * @return the save mode matching {@code s}
 * @throws IllegalArgumentException if no save mode has the specified name
 */
@Nonnull
public static SaveMode toSaveMode(final String s) {
    Preconditions.checkNotNull(s, "Save mode cannot be null");

    // Lower-case with Locale.ROOT: with a default locale such as Turkish, "IGNORE" would
    // become "\u0131gnore" (dotless i) and no longer match the case labels below.
    switch (s.toLowerCase(java.util.Locale.ROOT)) {
        case "overwrite":
            return SaveMode.Overwrite;

        case "append":
            return SaveMode.Append;

        case "ignore":
            return SaveMode.Ignore;

        case "error":
        case "errorifexists":
        case "default":
            return SaveMode.ErrorIfExists;

        default:
            log.debug("Unknown save mode: {}", s);
            throw new IllegalArgumentException("Unknown save mode: " + s
                    + ". Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists'.");
    }
}
From source file:sql.JavaSQLDataSourceExample.java
License:Apache License
private static void runBasicDataSourceExample(SparkSession spark) { // $example on:generic_load_save_functions$ Dataset<Row> usersDF = spark.read() .load("/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/users.parquet"); usersDF.select("name", "favorite_color").write().mode("overwrite").save("namesAndFavColors.parquet"); usersDF.show();/*from ww w. java 2 s. co m*/ // $example off:generic_load_save_functions$ // $example on:manual_load_options$ Dataset<Row> peopleDF = spark.read().format("json") .load("/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/people.json"); peopleDF.select("name", "age").write().mode(SaveMode.Ignore).format("parquet").save("namesAndAges.parquet"); peopleDF.show(); // $example off:manual_load_options$ // $example on:direct_sql$ Dataset<Row> sqlDF = spark.sql( "SELECT * FROM parquet.`/home/paul/spark/spark-2.1.0-bin-hadoop2.7/examples/src/main/resources/users.parquet`"); sqlDF.show(); // $example off:direct_sql$ }