Example usage for org.apache.spark.sql SaveMode Overwrite

List of usage examples for org.apache.spark.sql SaveMode Overwrite

Introduction

In this page you can find the example usage for org.apache.spark.sql SaveMode Overwrite.

Prototype

SaveMode Overwrite

To view the source code for org.apache.spark.sql SaveMode Overwrite, click the Source Link below.

Click Source Link

Document

Overwrite mode means that when saving a DataFrame to a data source, if data/table already exists, existing data is expected to be overwritten by the contents of the DataFrame.

Usage

From source file:HoodieJavaApp.java

License:Apache License

/**
 * End-to-end Hoodie demo: performs an insert-only commit, then an update (upsert)
 * commit against the same dataset, reads the dataset back via SQL, and — for
 * COPY_ON_WRITE tables — demonstrates an incremental read of only the second commit.
 *
 * @throws Exception if any Spark, Hadoop filesystem, or Hoodie operation fails
 */
public void run() throws Exception {

    // Local Spark session; Kryo serialization is required by Hoodie.
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]")
            .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen;
    if (nonPartitionedTable) {
        // All data goes to base-path
        dataGen = new HoodieTestDataGenerator(new String[] { "" });
    } else {
        dataGen = new HoodieTestDataGenerator();
    }

    /**
     * Commit with only inserts
     */
    // Generate some input..
    List<String> records1 = DataSourceTestUtils
            .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    // Save as hoodie dataset (copy on write)
    DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
            .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
            .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
                    DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // use to combine duplicate records in input/with disk val
            .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .mode(SaveMode.Overwrite); // This will remove any existing data at path below, and create a

    updateHiveSyncConfig(writer);
    // new dataset if needed
    writer.save(tablePath); // ultimately where the dataset will be placed
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("First commit at instant time :" + commitInstantTime1);

    /**
     * Commit that updates records
     */
    List<String> records2 = DataSourceTestUtils
            .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);

    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    // Fix: previously logged commitInstantTime1 here, mislabelling the second commit.
    logger.info("Second commit at instant time :" + commitInstantTime2);

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
            // pass any path glob, can include hoodie & non-hoodie
            // datasets
            .load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare was greater than 2.
    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
         */
        Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
                .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
                        DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
                .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
                .load(tablePath); // For incremental view, pass in the root/base path of dataset

        logger.info("You will only see records from : " + commitInstantTime2);
        hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
}

From source file:com.impetus.spark.client.FSClient.java

License:Apache License

/**
 * Writes the given DataFrame to {@code outputFilePath} as CSV, overwriting any
 * existing data at that location ({@link SaveMode#Overwrite}).
 * 
 * @param df
 *            the DataFrame to persist
 * @param outputFilePath
 *            the output file path
 * @return true, if successful (failures surface as unchecked exceptions from Spark)
 */
private boolean writeDataInCsvFile(DataFrame df, String outputFilePath) {
    // TODO change savemode to APPEND or ErrorIfExists as supported by
    // latest version
    df.save(outputFilePath, SparkPropertiesConstants.SOURCE_CSV, SaveMode.Overwrite);
    return true;
}

From source file:com.impetus.spark.client.FSClient.java

License:Apache License

/**
 * Writes the given DataFrame to {@code outputFilePath} as JSON, overwriting any
 * existing data at that location ({@link SaveMode#Overwrite}).
 * 
 * @param df
 *            the DataFrame to persist
 * @param outputFilePath
 *            the output file path
 * @return true, if successful (failures surface as unchecked exceptions from Spark)
 */
private boolean writeDataInJsonFile(DataFrame df, String outputFilePath) {
    // TODO change savemode to APPEND or ErrorIfExists as supported by
    // latest version
    df.save(outputFilePath, "json", SaveMode.Overwrite);
    return true;
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.SparkUtil.java

License:Apache License

/**
 * Returns the save mode with the specified name.
 *
 * <p>Matching is case-insensitive; {@code "error"}, {@code "errorifexists"} and
 * {@code "default"} all map to {@link SaveMode#ErrorIfExists}.
 *
 * @param s the save mode name; must not be null
 * @return the corresponding {@link SaveMode}
 * @throws IllegalArgumentException if no save mode has the specified name
 */
@Nonnull
public static SaveMode toSaveMode(final String s) {
    Preconditions.checkNotNull(s, "Save mode cannot be null");
    // Locale.ROOT makes the case-fold locale-independent; the default locale's
    // rules (e.g. Turkish dotless 'i') would break matching of upper-case input.
    switch (s.toLowerCase(java.util.Locale.ROOT)) {
    case "overwrite":
        return SaveMode.Overwrite;

    case "append":
        return SaveMode.Append;

    case "ignore":
        return SaveMode.Ignore;

    case "error":
    case "errorifexists":
    case "default":
        return SaveMode.ErrorIfExists;

    default:
        log.debug("Unknown save mode: {}", s);
        // Message now also lists 'default', which the switch above accepts.
        throw new IllegalArgumentException("Unknown save mode: " + s
                + ". Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists', 'default'.");
    }
}

From source file:eu.amidst.sparklink.core.io.DataSparkWriter.java

License:Apache License

/**
 * Persists the DataFrame backing {@code data} to {@code path} in the requested
 * file format, replacing any previous output at that location.
 */
public static void writeDataToFolder(DataSpark data, String path, SQLContext sqlContext, String formatFile)
        throws Exception {

    // format() and mode() are independent writer settings; Overwrite clobbers existing output.
    data.getDataFrame(sqlContext).write().format(formatFile).mode(SaveMode.Overwrite).save(path);
}

From source file:org.apache.phoenix.spark.datasource.v2.PhoenixDataSource.java

License:Apache License

/**
 * Creates a DataSourceWriter for saving a DataFrame to Phoenix.
 *
 * <p>Only {@link SaveMode#Overwrite} is supported; the Phoenix table name and
 * ZooKeeper URL options are required.
 *
 * @throws RuntimeException if the save mode is unsupported or a required option is missing
 */
@Override
public Optional<DataSourceWriter> createWriter(String writeUUID, StructType schema, SaveMode mode,
        DataSourceOptions options) {
    if (!mode.equals(SaveMode.Overwrite)) {
        // Fix: the enum constant is 'Overwrite'; the message previously misspelled it 'OverWrite'.
        throw new RuntimeException("SaveMode other than SaveMode.Overwrite is not supported");
    }
    if (!options.tableName().isPresent()) {
        throw new RuntimeException("No Phoenix option " + DataSourceOptions.TABLE_KEY + " defined");
    }
    if (!options.get(PhoenixDataSource.ZOOKEEPER_URL).isPresent()) {
        throw new RuntimeException("No Phoenix option " + PhoenixDataSource.ZOOKEEPER_URL + " defined");
    }

    PhoenixDataSourceWriteOptions writeOptions = createPhoenixDataSourceWriteOptions(options, schema);
    return Optional.of(new PhoenixDatasourceWriter(writeOptions));
}