List of usage examples for org.apache.spark.sql SaveMode Overwrite
SaveMode Overwrite
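SaveMode.Overwrite tells a DataFrameWriter that, if data already exists at the save target, it should be replaced by the contents of the DataFrame. Before the full examples below, here is a minimal sketch of the typical call pattern; the input and output paths are hypothetical and the session runs locally:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SaveModeOverwriteExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SaveModeOverwriteExample").master("local[1]")
                .getOrCreate();

        // Read some input; the path is hypothetical.
        Dataset<Row> df = spark.read().json("/tmp/input.json");

        // Overwrite replaces any existing data at the target path;
        // contrast with SaveMode.Append, SaveMode.Ignore and SaveMode.ErrorIfExists.
        df.write().mode(SaveMode.Overwrite).parquet("/tmp/output.parquet");

        spark.stop();
    }
}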
From source file:HoodieJavaApp.java
License:Apache License
public void run() throws Exception {
    // Spark session setup.
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]")
            .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = null;
    if (nonPartitionedTable) {
        // All data goes to base-path
        dataGen = new HoodieTestDataGenerator(new String[] { "" });
    } else {
        dataGen = new HoodieTestDataGenerator();
    }

    /**
     * Commit with only inserts
     */
    // Generate some input..
    List<String> records1 = DataSourceTestUtils
            .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    // Save as hoodie dataset (copy on write)
    DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
            .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
            .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
                    DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // used to combine duplicate records in input/with disk val
            .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .mode(SaveMode.Overwrite); // This will remove any existing data at the path below, and create a new dataset if needed
    updateHiveSyncConfig(writer);
    writer.save(tablePath); // ultimately where the dataset will be placed
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("First commit at instant time :" + commitInstantTime1);

    /**
     * Commit that updates records
     */
    List<String> records2 = DataSourceTestUtils
            .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);
    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("Second commit at instant time :" + commitInstantTime2);

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
            // pass any path glob, can include hoodie & non-hoodie datasets
            .load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare was greater than 2.
    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
         */
        Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
                .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
                        DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
                .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
                .load(tablePath); // For incremental view, pass in the root/base path of dataset
        logger.info("You will only see records from : " + commitInstantTime2);
        hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
}
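Note the pairing of save modes above: the first commit uses SaveMode.Overwrite to bootstrap the dataset, removing anything already at tablePath, while the second commit uses SaveMode.Append so the update is layered onto the existing Hoodie timeline instead of wiping it out.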
From source file:com.impetus.spark.client.FSClient.java
License:Apache License
/**
 * Write data in csv file.
 *
 * @param df
 *            the df
 * @param outputFilePath
 *            the output file path
 * @return true, if successful
 */
private boolean writeDataInCsvFile(DataFrame df, String outputFilePath) {
    // TODO change savemode to APPEND or ErrorIfExists as supported by latest version
    df.save(outputFilePath, SparkPropertiesConstants.SOURCE_CSV, SaveMode.Overwrite);
    return true;
}
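The DataFrame.save(path, source, mode) call above is the Spark 1.x API; it was deprecated in Spark 1.4 and removed in Spark 2.0. A sketch of the equivalent write with the current DataFrameWriter API, assuming a Dataset<Row> df and a hypothetical output path:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

// Spark 2.x+ equivalent of df.save(outputFilePath, "csv", SaveMode.Overwrite)
void writeCsv(Dataset<Row> df, String outputFilePath) {
    df.write().mode(SaveMode.Overwrite).format("csv").save(outputFilePath);
}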
From source file:com.impetus.spark.client.FSClient.java
License:Apache License
/**
 * Write data in json file.
 *
 * @param df
 *            the df
 * @param outputFilePath
 *            the output file path
 * @return true, if successful
 */
private boolean writeDataInJsonFile(DataFrame df, String outputFilePath) {
    // TODO change savemode to APPEND or ErrorIfExists as supported by latest version
    df.save(outputFilePath, "json", SaveMode.Overwrite);
    return true;
}
From source file:com.thinkbiganalytics.kylo.catalog.spark.SparkUtil.java
License:Apache License
/**
 * Returns the save mode with the specified name.
 *
 * @throws IllegalArgumentException if no save mode has the specified name
 */
@Nonnull
public static SaveMode toSaveMode(final String s) {
    Preconditions.checkNotNull(s, "Save mode cannot be null");
    switch (s.toLowerCase()) {
    case "overwrite":
        return SaveMode.Overwrite;
    case "append":
        return SaveMode.Append;
    case "ignore":
        return SaveMode.Ignore;
    case "error":
    case "errorifexists":
    case "default":
        return SaveMode.ErrorIfExists;
    default:
        log.debug("Unknown save mode: {}", s);
        throw new IllegalArgumentException("Unknown save mode: " + s
                + ". Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists'.");
    }
}
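For illustration, a hypothetical caller might map a user-supplied mode string onto a write like this (toSaveMode is the method above; df and outputPath are assumptions):

// Parse a user-supplied mode string and apply it to a write.
SaveMode mode = SparkUtil.toSaveMode("overwrite"); // returns SaveMode.Overwrite
df.write().mode(mode).parquet(outputPath);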
From source file:eu.amidst.sparklink.core.io.DataSparkWriter.java
License:Apache License
public static void writeDataToFolder(DataSpark data, String path, SQLContext sqlContext, String formatFile)
        throws Exception {
    data.getDataFrame(sqlContext).write().mode(SaveMode.Overwrite).format(formatFile).save(path);
}
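A hedged usage sketch, assuming a DataSpark instance and SQLContext are already in scope; the output folder and format are hypothetical:

// Write the underlying DataFrame as Parquet, replacing any previous output at the folder.
DataSparkWriter.writeDataToFolder(dataSpark, "/tmp/amidst-output", sqlContext, "parquet");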
From source file:org.apache.phoenix.spark.datasource.v2.PhoenixDataSource.java
License:Apache License
@Override
public Optional<DataSourceWriter> createWriter(String writeUUID, StructType schema, SaveMode mode,
        DataSourceOptions options) {
    if (!mode.equals(SaveMode.Overwrite)) {
        throw new RuntimeException("SaveMode other than SaveMode.Overwrite is not supported");
    }
    if (!options.tableName().isPresent()) {
        throw new RuntimeException("No Phoenix option " + DataSourceOptions.TABLE_KEY + " defined");
    }
    if (!options.get(PhoenixDataSource.ZOOKEEPER_URL).isPresent()) {
        throw new RuntimeException("No Phoenix option " + PhoenixDataSource.ZOOKEEPER_URL + " defined");
    }
    PhoenixDataSourceWriteOptions writeOptions = createPhoenixDataSourceWriteOptions(options, schema);
    return Optional.of(new PhoenixDatasourceWriter(writeOptions));
}
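Because this createWriter implementation rejects every mode except SaveMode.Overwrite, a write through the connector must request it explicitly. A sketch of such a write, assuming the "phoenix" format short name and the "table"/"zkUrl" option keys checked above; the table name and ZooKeeper quorum are placeholders:

df.write().format("phoenix")
        .option("table", "OUTPUT_TABLE") // hypothetical Phoenix table
        .option("zkUrl", "zk-host:2181") // hypothetical ZooKeeper quorum
        .mode(SaveMode.Overwrite)        // the only mode this writer accepts
        .save();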