Example usage for org.apache.spark.sql SaveMode Append

List of usage examples for org.apache.spark.sql SaveMode Append

Introduction

This page lists example usages of org.apache.spark.sql.SaveMode.Append.

Prototype

SaveMode Append

Document

Append mode means that when saving a DataFrame to a data source, if data/table already exists, contents of the DataFrame are expected to be appended to existing data.
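
A minimal, self-contained sketch of that behavior (the output path, class name, and column name are illustrative only, not taken from the examples below): writing the same DataFrame twice with SaveMode.Append keeps both sets of rows at the target path, where ErrorIfExists would fail and Overwrite would replace them.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SaveModeAppendExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SaveModeAppendExample").master("local[1]")
                .getOrCreate();

        // Hypothetical single-column DataFrame used only for illustration.
        Dataset<Row> df = spark.createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING()).toDF("value");

        // With SaveMode.Append, a second run adds rows to the existing data at this path
        // instead of failing (ErrorIfExists) or replacing it (Overwrite).
        df.write().mode(SaveMode.Append).parquet("/tmp/savemode_append_example");

        spark.stop();
    }
}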

Usage

From source file:HoodieJavaApp.java

License:Apache License

public void run() throws Exception {

    // Spark session setup..
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]")
            .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = null;
    if (nonPartitionedTable) {
        // All data goes to base-path
        dataGen = new HoodieTestDataGenerator(new String[] { "" });
    } else {
        dataGen = new HoodieTestDataGenerator();
    }

    /**
     * Commit with only inserts
     */
    // Generate some input..
    List<String> records1 = DataSourceTestUtils
            .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    // Save as hoodie dataset (copy on write)
    DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
            .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
            .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
                    DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // used to combine duplicate records within the input and against the value already on disk
            .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .mode(SaveMode.Overwrite); // This will remove any existing data at the path below, and create a new dataset if needed

    updateHiveSyncConfig(writer);
    writer.save(tablePath); // ultimately where the dataset will be placed
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("First commit at instant time :" + commitInstantTime1);

    /**
     * Commit that updates records
     */
    List<String> records2 = DataSourceTestUtils
            .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);

    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("Second commit at instant time :" + commitInstantTime1);

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
            // pass any path glob, can include hoodie & non-hoodie
            // datasets
            .load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare was greater than 2.
    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
         */
        Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
                .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
                        DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
                .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
                .load(tablePath); // For incremental view, pass in the root/base path of dataset

        logger.info("You will only see records from : " + commitInstantTime2);
        hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
}

From source file:HoodieJavaStreamingApp.java

License:Apache License

/**
 * Adding data to the streaming source and showing results over time
 * @param spark
 * @param fs
 * @param inputDF1
 * @param inputDF2
 * @throws Exception
 */
public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2)
        throws Exception {
    inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
    // wait for spark streaming to process one microbatch
    Thread.sleep(3000);
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("First commit at instant time :" + commitInstantTime1);

    inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);
    // wait for spark streaming to process one microbatch
    Thread.sleep(3000);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("Second commit at instant time :" + commitInstantTime1);

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
            // pass any path glob, can include hoodie & non-hoodie
            // datasets
            .load(tablePath + "/*/*/*/*");
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare was greater than 2.
    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
         */
        Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
                .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
                        DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
                .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
                .load(tablePath); // For incremental view, pass in the root/base path of dataset

        logger.info("You will only see records from : " + commitInstantTime2);
        hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
}

From source file:com.impetus.spark.client.CassSparkClient.java

License:Apache License

@Override
public void saveDataFrame(DataFrame dataFrame, Class<?> entityClazz, Map<String, Object> properties) {
    Map<String, String> options = new HashMap<>();
    options.put("c_table", (String) properties.get(TABLE));
    options.put(KEYSPACE, (String) properties.get(KEYSPACE));

    // TODO update order
    dataFrame.save(SparkPropertiesConstants.SOURCE_CASSANDRA, SaveMode.Append, options);
}
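
The snippet above uses the Spark 1.x DataFrame.save(source, mode, options) API. A rough sketch of the same append expressed through the Spark 2.x DataFrameWriter API with the DataStax Cassandra connector (the keyspace and table names are placeholders, not taken from the original class):

    // Sketch only: assumes the spark-cassandra-connector is on the classpath and that
    // dataFrame is a Dataset<Row>; "my_keyspace" and "my_table" are placeholder names.
    dataFrame.write()
            .format("org.apache.spark.sql.cassandra")
            .option("keyspace", "my_keyspace")
            .option("table", "my_table")
            .mode(SaveMode.Append)
            .save();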

From source file:com.splicemachine.derby.stream.spark.SparkDataSet.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<LocatedRow> writeParquetFile(int[] baseColumnMap, int[] partitionBy, String location,
        String compression, OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession()
                .createDataFrame(
                        rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context)))
                                .map(new LocatedRowToRowFunction()),
                        context.getOperation().getExecRowDefinition().schema());

        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        List<String> partitionByCols = new ArrayList();
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols.add(ValueRow.getNamedColumn(partitionBy[i]));
        }
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()])).mode(SaveMode.Append)
                .parquet(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(
                SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.splicemachine.derby.stream.spark.SparkDataSet.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<LocatedRow> writeORCFile(int[] baseColumnMap, int[] partitionBy, String location,
        String compression, OperationContext context) {
    try {
        Dataset<Row> insertDF = SpliceSpark.getSession()
                .createDataFrame(
                        rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context)))
                                .map(new LocatedRowToRowFunction()),
                        context.getOperation().getExecRowDefinition().schema());
        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        List<Column> partitionByCols = new ArrayList();
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols.add(new Column(ValueRow.getNamedColumn(partitionBy[i])));
        }
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()])).mode(SaveMode.Append)
                .orc(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(
                SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.splicemachine.derby.stream.spark.SparkDataSet.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<LocatedRow> writeTextFile(SpliceOperation op, String location, String characterDelimiter,
        String columnDelimiter, int[] baseColumnMap, OperationContext context) {

    try {
        Dataset<Row> insertDF = SpliceSpark.getSession()
                .createDataFrame(
                        rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context)))
                                .map(new LocatedRowToRowFunction()),
                        context.getOperation().getExecRowDefinition().schema());
        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        insertDF.write().mode(SaveMode.Append).csv(location);
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(
                SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.splicemachine.derby.stream.spark.SparkDataSetProcessor.java

License:Apache License

@Override
public void createEmptyExternalFile(ExecRow execRows, int[] baseColumnMap, int[] partitionBy, String storedAs,
        String location) throws StandardException {
    try {

        Dataset<Row> empty = SpliceSpark.getSession().createDataFrame(new ArrayList<Row>(), execRows.schema());

        List<Column> cols = new ArrayList();
        for (int i = 0; i < baseColumnMap.length; i++) {
            cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i])));
        }
        List<String> partitionByCols = new ArrayList();
        for (int i = 0; i < partitionBy.length; i++) {
            partitionByCols.add(ValueRow.getNamedColumn(partitionBy[i]));
        }

        if (!SIDriver.driver().fileSystem().getPath(location).toFile().exists()) {
            if (storedAs != null) {
                if (storedAs.toLowerCase().equals("p")) {
                    empty.write().option("compression", "none")
                            .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
                            .mode(SaveMode.Append).parquet(location);
                }
            }
        }
    } catch (Exception e) {
        throw StandardException.newException(SQLState.EXTERNAL_TABLES_READ_FAILURE, e.getMessage());
    }
}

From source file:com.splicemachine.tutorials.model.RULPredictiveModel.java

License:Apache License

/**
 * Stored Procedure for Predictions
 */
public static void predictRUL(String sensorTableName, String resultsTableName, String savedModelPath,
        int loopinterval) {

    try {

        //Initialize variables
        if (sensorTableName == null || sensorTableName.length() == 0)
            sensorTableName = "IOT.SENSOR_AGG_1_VIEW";
        if (resultsTableName == null || resultsTableName.length() == 0)
            resultsTableName = "IOT.PREDICTION_EXT";
        if (savedModelPath == null || savedModelPath.length() == 0)
            savedModelPath = "/tmp";
        if (!savedModelPath.endsWith("/"))
            savedModelPath = savedModelPath + "/";
        savedModelPath += "model/";

        String jdbcUrl = "jdbc:splice://localhost:1527/splicedb;user=splice;password=admin;useSpark=true";
        Connection conn = DriverManager.getConnection(jdbcUrl);

        SparkSession sparkSession = SpliceSpark.getSession();

        //Specify the data for predictions
        Map<String, String> options = new HashMap<String, String>();
        options.put("driver", "com.splicemachine.db.jdbc.ClientDriver");
        options.put("url", jdbcUrl);
        options.put("dbtable", sensorTableName);

        //Load Model to use for predictions
        CrossValidatorModel cvModel = CrossValidatorModel.load(savedModelPath);

        //Keep checking for new data and make predictions
        while (loopinterval > 0) {
            //Sensor data requiring predictions
            Dataset<Row> sensords = sparkSession.read().format("jdbc").options(options).load();

            //prepare data
            sensords = sensords.na().fill(0);

            //make predictions
            Dataset<Row> predictions = cvModel.transform(sensords)
                    .select("ENGINE_TYPE", "UNIT", "TIME", "prediction")
                    .withColumnRenamed("prediction", "PREDICTION");

            //Save predictions
            String fileName = "temp_pred_" + RandomStringUtils.randomAlphabetic(6).toLowerCase();

            predictions.write().mode(SaveMode.Append).csv("/tmp/data_pred/predictions");

            //Mark records for which predictions are made
            PreparedStatement pStmtDel = conn.prepareStatement(
                    "delete  from IOT.TO_PROCESS_SENSOR s where exists (select 1 from IOT.PREDICTIONS_EXT p where p.engine_type = s.engine_type and p.unit= s.unit and p.time=s.time )");
            pStmtDel.execute();
            pStmtDel.close();
        }

    } catch (SQLException sqle) {
        System.out.println("Error  :::::" + sqle.toString());
        LOG.error("Exception in getColumnStatistics", sqle);
        sqle.printStackTrace();
    }

}

From source file:com.thinkbiganalytics.kylo.catalog.spark.SparkUtil.java

License:Apache License

/**
 * Returns the save mode with the specified name.
 *
 * @throws IllegalArgumentException if no save mode has the specified name
 */
@Nonnull
public static SaveMode toSaveMode(final String s) {
    Preconditions.checkNotNull(s, "Save mode cannot be null");
    switch (s.toLowerCase()) {
    case "overwrite":
        return SaveMode.Overwrite;

    case "append":
        return SaveMode.Append;

    case "ignore":
        return SaveMode.Ignore;

    case "error":
    case "errorifexists":
    case "default":
        return SaveMode.ErrorIfExists;

    default:
        log.debug("Unknown save mode: {}", s);
        throw new IllegalArgumentException("Unknown save mode: " + s
                + ". Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists'.");
    }
}
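
A brief usage sketch for the helper above, assuming a Dataset&lt;Row&gt; named df and an output path invented for the example:

    // "append" (case-insensitive) resolves to SaveMode.Append; an unknown string throws
    // IllegalArgumentException, so callers can validate user-supplied mode names up front.
    SaveMode mode = SparkUtil.toSaveMode("append");
    df.write().mode(mode).parquet("/tmp/kylo-catalog-output"); // illustrative path only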

From source file:com.thinkbiganalytics.spark.DataSet16.java

License:Apache License

@Override
public void writeToTable(String partitionColumn, String fqnTable) {
    dataframe.write().partitionBy(partitionColumn).mode(SaveMode.Append).saveAsTable(fqnTable);
}