List of usage examples for org.apache.spark.sql SaveMode Append
SaveMode Append
To view the source code for org.apache.spark.sql SaveMode.Append, click the Source Link below.
From source file:HoodieJavaApp.java
License:Apache License
public void run() throws Exception { // Spark session setup.. SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]") .getOrCreate();//w ww . j a va2 s .c om JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext()); FileSystem fs = FileSystem.get(jssc.hadoopConfiguration()); // Generator of some records to be loaded in. HoodieTestDataGenerator dataGen = null; if (nonPartitionedTable) { // All data goes to base-path dataGen = new HoodieTestDataGenerator(new String[] { "" }); } else { dataGen = new HoodieTestDataGenerator(); } /** * Commit with only inserts */ // Generate some input.. List<String> records1 = DataSourceTestUtils .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100)); Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2)); // Save as hoodie dataset (copy on write) DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // use to combine duplicate records in input/with disk val .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), nonPartitionedTable ? 
NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor .mode(SaveMode.Overwrite); // This will remove any existing data at path below, and create a updateHiveSyncConfig(writer); // new dataset if needed writer.save(tablePath); // ultimately where the dataset will be placed String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); logger.info("First commit at instant time :" + commitInstantTime1); /** * Commit that updates records */ List<String> records2 = DataSourceTestUtils .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100)); Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2") .option("hoodie.upsert.shuffle.parallelism", "2") .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append); updateHiveSyncConfig(writer); writer.save(tablePath); String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); logger.info("Second commit at instant time :" + commitInstantTime1); /** * Read & do some queries */ Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie") // pass any path glob, can include hoodie & non-hoodie // datasets .load(tablePath + (nonPartitionedTable ? 
"/*" : "/*/*/*/*")); hoodieROViewDF.registerTempTable("hoodie_ro"); spark.sql("describe hoodie_ro").show(); // all trips whose fare was greater than 2. spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show(); if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) { /** * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE */ Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie") .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above .load(tablePath); // For incremental view, pass in the root/base path of dataset logger.info("You will only see records from : " + commitInstantTime2); hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show(); } }
From source file:HoodieJavaStreamingApp.java
License:Apache License
/** * Adding data to the streaming source and showing results over time * @param spark/*from www . j a va 2 s . com*/ * @param fs * @param inputDF1 * @param inputDF2 * @throws Exception */ public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception { inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath); // wait for spark streaming to process one microbatch Thread.sleep(3000); String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); logger.info("First commit at instant time :" + commitInstantTime1); inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath); // wait for spark streaming to process one microbatch Thread.sleep(3000); String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath); logger.info("Second commit at instant time :" + commitInstantTime1); /** * Read & do some queries */ Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie") // pass any path glob, can include hoodie & non-hoodie // datasets .load(tablePath + "/*/*/*/*"); hoodieROViewDF.registerTempTable("hoodie_ro"); spark.sql("describe hoodie_ro").show(); // all trips whose fare was greater than 2. spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show(); if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) { /** * Consume incrementally, only changes in commit 2 above. 
Currently only supported for COPY_ON_WRITE TABLE */ Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie") .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above .load(tablePath); // For incremental view, pass in the root/base path of dataset logger.info("You will only see records from : " + commitInstantTime2); hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show(); } }
From source file:com.impetus.spark.client.CassSparkClient.java
License:Apache License
@Override public void saveDataFrame(DataFrame dataFrame, Class<?> entityClazz, Map<String, Object> properties) { Map<String, String> options = new HashMap(); options.put("c_table", (String) properties.get(TABLE)); options.put(KEYSPACE, (String) properties.get(KEYSPACE)); // TODO update order dataFrame.save(SparkPropertiesConstants.SOURCE_CASSANDRA, SaveMode.Append, options); }
From source file:com.splicemachine.derby.stream.spark.SparkDataSet.java
License:Apache License
/**
 * Writes this data set as Parquet files under {@code location}, partitioned by the
 * columns named in {@code partitionBy}, and returns a one-row data set containing
 * the number of records written.
 *
 * @param baseColumnMap base-table column positions; currently not used to project the
 *                      output — TODO(review) confirm whether a projection was intended
 * @param partitionBy   column positions to use as Parquet partition columns
 * @param location      target directory for the Parquet files
 * @param compression   Spark compression codec name (e.g. "snappy", "none")
 * @param context       operation context used to count the rows written
 * @return a single-row data set holding the written-record count
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<LocatedRow> writeParquetFile(int[] baseColumnMap, int[] partitionBy, String location,
        String compression, OperationContext context) {
    try {
        // Wrap each row so the context counts it, then convert to Spark Rows.
        Dataset<Row> insertDF = SpliceSpark.getSession()
                .createDataFrame(
                        rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context)))
                                .map(new LocatedRowToRowFunction()),
                        context.getOperation().getExecRowDefinition().schema());
        // Resolve the partition column names from their positions.
        List<String> partitionByCols = new ArrayList<>(partitionBy.length);
        for (int pos : partitionBy) {
            partitionByCols.add(ValueRow.getNamedColumn(pos));
        }
        // NOTE(review): the original also built a List<Column> from baseColumnMap that
        // was never used; it has been removed.
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
                .mode(SaveMode.Append)
                .parquet(location);
        // Report the number of records written as a one-row result.
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(
                SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.splicemachine.derby.stream.spark.SparkDataSet.java
License:Apache License
/**
 * Writes this data set as ORC files under {@code location}, partitioned by the
 * columns named in {@code partitionBy}, and returns a one-row data set containing
 * the number of records written.
 *
 * <p>FIX: the original collected {@code Column} objects into the partition list and
 * then called {@code toArray(new String[...])} on it, which throws
 * {@code ArrayStoreException} at runtime for any non-empty {@code partitionBy}
 * (it compiled only because the list was declared raw). The list now holds the
 * column <em>names</em>, matching {@code writeParquetFile}.
 *
 * @param baseColumnMap base-table column positions; currently not used to project the
 *                      output — TODO(review) confirm whether a projection was intended
 * @param partitionBy   column positions to use as ORC partition columns
 * @param location      target directory for the ORC files
 * @param compression   Spark compression codec name (e.g. "snappy", "none")
 * @param context       operation context used to count the rows written
 * @return a single-row data set holding the written-record count
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<LocatedRow> writeORCFile(int[] baseColumnMap, int[] partitionBy, String location,
        String compression, OperationContext context) {
    try {
        // Wrap each row so the context counts it, then convert to Spark Rows.
        Dataset<Row> insertDF = SpliceSpark.getSession()
                .createDataFrame(
                        rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context)))
                                .map(new LocatedRowToRowFunction()),
                        context.getOperation().getExecRowDefinition().schema());
        // Resolve the partition column NAMES (not Column objects) from their positions.
        List<String> partitionByCols = new ArrayList<>(partitionBy.length);
        for (int pos : partitionBy) {
            partitionByCols.add(ValueRow.getNamedColumn(pos));
        }
        // NOTE(review): the original also built a List<Column> from baseColumnMap that
        // was never used; it has been removed.
        insertDF.write().option(SPARK_COMPRESSION_OPTION, compression)
                .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
                .mode(SaveMode.Append)
                .orc(location);
        // Report the number of records written as a one-row result.
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(
                SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.splicemachine.derby.stream.spark.SparkDataSet.java
License:Apache License
/**
 * Writes this data set as delimited text (CSV) files under {@code location} and
 * returns a one-row data set containing the number of records written.
 *
 * <p>FIX: the original ignored both delimiter parameters and always wrote with the
 * Spark CSV defaults. They are now applied; {@code null} arguments fall back to the
 * Spark defaults ("," and '"'), so existing callers that pass null are unaffected.
 *
 * @param op                 the operation being written (unused here; kept for the interface)
 * @param location           target directory for the text files
 * @param characterDelimiter quote character for fields, or null for the default '"'
 * @param columnDelimiter    field separator, or null for the default ","
 * @param baseColumnMap      base-table column positions; currently not used to project the
 *                           output — TODO(review) confirm whether a projection was intended
 * @param context            operation context used to count the rows written
 * @return a single-row data set holding the written-record count
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<LocatedRow> writeTextFile(SpliceOperation op, String location, String characterDelimiter,
        String columnDelimiter, int[] baseColumnMap, OperationContext context) {
    try {
        // Wrap each row so the context counts it, then convert to Spark Rows.
        Dataset<Row> insertDF = SpliceSpark.getSession()
                .createDataFrame(
                        rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context)))
                                .map(new LocatedRowToRowFunction()),
                        context.getOperation().getExecRowDefinition().schema());
        // NOTE(review): the original also built a List<Column> from baseColumnMap that
        // was never used; it has been removed.
        insertDF.write().mode(SaveMode.Append)
                .option("sep", columnDelimiter == null ? "," : columnDelimiter)
                .option("quote", characterDelimiter == null ? "\"" : characterDelimiter)
                .csv(location);
        // Report the number of records written as a one-row result.
        ValueRow valueRow = new ValueRow(1);
        valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
        return new SparkDataSet<>(
                SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.splicemachine.derby.stream.spark.SparkDataSetProcessor.java
License:Apache License
@Override public void createEmptyExternalFile(ExecRow execRows, int[] baseColumnMap, int[] partitionBy, String storedAs, String location) throws StandardException { try {/*from w w w . j av a 2s . c o m*/ Dataset<Row> empty = SpliceSpark.getSession().createDataFrame(new ArrayList<Row>(), execRows.schema()); List<Column> cols = new ArrayList(); for (int i = 0; i < baseColumnMap.length; i++) { cols.add(new Column(ValueRow.getNamedColumn(baseColumnMap[i]))); } List<String> partitionByCols = new ArrayList(); for (int i = 0; i < partitionBy.length; i++) { partitionByCols.add(ValueRow.getNamedColumn(partitionBy[i])); } if (!SIDriver.driver().fileSystem().getPath(location).toFile().exists()) { if (storedAs != null) { if (storedAs.toLowerCase().equals("p")) { empty.write().option("compression", "none") .partitionBy(partitionByCols.toArray(new String[partitionByCols.size()])) .mode(SaveMode.Append).parquet(location); } } } } catch (Exception e) { throw StandardException.newException(SQLState.EXTERNAL_TABLES_READ_FAILURE, e.getMessage()); } }
From source file:com.splicemachine.tutorials.model.RULPredictiveModel.java
License:Apache License
/** * Stored Procedure for Predictions// w ww. j a v a 2s . c o m */ public static void predictRUL(String sensorTableName, String resultsTableName, String savedModelPath, int loopinterval) { try { //Initialize variables if (sensorTableName == null || sensorTableName.length() == 0) sensorTableName = "IOT.SENSOR_AGG_1_VIEW"; if (resultsTableName == null || resultsTableName.length() == 0) resultsTableName = "IOT.PREDICTION_EXT"; if (savedModelPath == null || savedModelPath.length() == 0) savedModelPath = "/tmp"; if (!savedModelPath.endsWith("/")) savedModelPath = savedModelPath + "/"; savedModelPath += "model/"; String jdbcUrl = "jdbc:splice://localhost:1527/splicedb;user=splice;password=admin;useSpark=true"; Connection conn = DriverManager.getConnection(jdbcUrl); SparkSession sparkSession = SpliceSpark.getSession(); //Specify the data for predictions Map<String, String> options = new HashMap<String, String>(); options.put("driver", "com.splicemachine.db.jdbc.ClientDriver"); options.put("url", jdbcUrl); options.put("dbtable", sensorTableName); //Load Model to use for predictins CrossValidatorModel cvModel = CrossValidatorModel.load(savedModelPath); //Keep checking for new data and make predictions while (loopinterval > 0) { //Sensor data requiring predictions Dataset<Row> sensords = sparkSession.read().format("jdbc").options(options).load(); //prepare data sensords = sensords.na().fill(0); //make predictions Dataset<Row> predictions = cvModel.transform(sensords) .select("ENGINE_TYPE", "UNIT", "TIME", "prediction") .withColumnRenamed("prediction", "PREDICTION"); //Save predictions String fileName = "temp_pred_" + RandomStringUtils.randomAlphabetic(6).toLowerCase(); predictions.write().mode(SaveMode.Append).csv("/tmp/data_pred/predictions"); //Mark records for which predictions are made PreparedStatement pStmtDel = conn.prepareStatement( "delete from IOT.TO_PROCESS_SENSOR s where exists (select 1 from IOT.PREDICTIONS_EXT p where p.engine_type = s.engine_type and p.unit= 
s.unit and p.time=s.time )"); pStmtDel.execute(); pStmtDel.close(); } } catch (SQLException sqle) { System.out.println("Error :::::" + sqle.toString()); LOG.error("Exception in getColumnStatistics", sqle); sqle.printStackTrace(); } }
From source file:com.thinkbiganalytics.kylo.catalog.spark.SparkUtil.java
License:Apache License
/** * Returns the save mode with the specified name. * * @throws IllegalArgumentException if no save mode has the specified name *///from w w w . j a va2s . c om @Nonnull public static SaveMode toSaveMode(final String s) { Preconditions.checkNotNull(s, "Save mode cannot be null"); switch (s.toLowerCase()) { case "overwrite": return SaveMode.Overwrite; case "append": return SaveMode.Append; case "ignore": return SaveMode.Ignore; case "error": case "errorifexists": case "default": return SaveMode.ErrorIfExists; default: log.debug("Unknown save mode: {}", s); throw new IllegalArgumentException("Unknown save mode: " + s + ". Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists'."); } }
From source file:com.thinkbiganalytics.spark.DataSet16.java
License:Apache License
@Override public void writeToTable(String partitionColumn, String fqnTable) { dataframe.write().partitionBy(partitionColumn).mode(SaveMode.Append).saveAsTable(fqnTable); }