List of usage examples for org.apache.spark.sql.types DataTypes createStructField
public static StructField createStructField(String name, DataType dataType, boolean nullable)
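For orientation, here is a minimal sketch of the call in isolation, following the same pattern the examples below use (class name, column names, and sample rows are made up for illustration): each createStructField call defines one column, createStructType bundles them into a schema, and createDataFrame applies that schema to rows.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class CreateStructFieldSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("CreateStructFieldSketch").master("local[*]").getOrCreate();

        // Each createStructField call defines one column: name, data type, and whether nulls are allowed.
        List<StructField> fields = Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType schema = DataTypes.createStructType(fields);

        // Apply the schema to a few rows; the Row values must line up with the declared field types.
        List<Row> rows = Arrays.asList(RowFactory.create("Alice", 30), RowFactory.create("Bob", null));
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema();
        df.show();

        spark.stop();
    }
}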
From source file:KafkaSparkMongo.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n");
        System.exit(1);
    }

    String brokers = args[0];
    String topics = args[1];
    String UriMongo = "mongodb://localhost/streamSparkFinal.coll";
    dropDatabase(UriMongo);

    // Create the context with a 5 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount")
            .set("spark.app.id", "MongoSparkConnectorTour")
            .set("spark.mongodb.input.uri", UriMongo)
            .set("spark.mongodb.output.uri", UriMongo);
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));

    /*
     * Create a JavaReceiverInputDStream on the target ip:port and count the
     * words in the input stream of \n delimited text (e.g. generated by 'nc').
     * Note that skipping replication in the storage level is only for running locally;
     * replication is necessary in a distributed scenario for fault tolerance.
     */
    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
    messages.print();

    JavaDStream<String> lines = messages.map(x -> x._2());
    JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> {
        String[] wordy = SPACE.split(y);
        return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]);
    });

    words.foreachRDD(rdd -> {
        List<StructField> subFields = new ArrayList<>();
        subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true));

        List<StructField> fields = new ArrayList<>();
        fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true));
        fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true));
        StructType schema = DataTypes.createStructType(fields);

        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
        JavaRDD<Row> rowRDD = rdd.map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(),
                RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()),
                        Double.parseDouble(palabra._6())),
                Timestamp.from(Instant.parse(palabra._7()))));

        Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema);
        wordsDataFrame.show();
        MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save();
    });

    ssc.start();
    ssc.awaitTermination();
}
From source file:com.alpine.plugin.samples.ver1_0.JavaCountPlugin.CountPluginSparkJob.java
License:Open Source License
public DataFrame transform(OperatorParameters params, DataFrame inputDataFrame, SparkRuntimeUtils sparkUtils,
        OperatorListener listener) {
    String groupByVar = params.getTabularDatasetSelectedColumn(GroupByParamKey)._2();
    listener.notifyMessage("Starting the DataFrame Transformation");

    DataFrame selectedData = inputDataFrame.select(groupByVar);
    DataFrame df = selectedData.groupBy(groupByVar).count();

    // Customize the output schema
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(groupByVar, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("GroupCount", DataTypes.LongType, true));
    StructType dfSchema = DataTypes.createStructType(fields);

    return inputDataFrame.sqlContext().createDataFrame(df.rdd(), dfSchema);
}
From source file:com.andado.spark.examples.ml.JavaElementwiseProductExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaElementwiseProductExample").getOrCreate();

    // $example on$
    // Create some vector data; also works for sparse vectors
    List<Row> data = Arrays.asList(
            RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
            RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)));

    List<StructField> fields = new ArrayList<>(2);
    fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));
    StructType schema = DataTypes.createStructType(fields);

    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
    ElementwiseProduct transformer = new ElementwiseProduct()
            .setScalingVec(transformingVector)
            .setInputCol("vector")
            .setOutputCol("transformedVector");

    // Batch transform the vectors to create a new column:
    transformer.transform(dataFrame).show();
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) {
    // $example on:programmatic_schema$
    // Create an RDD
    JavaRDD<String> peopleRDD = spark.sparkContext()
            .textFile("examples/src/main/resources/people.txt", 1)
            .toJavaRDD();

    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<>();
    for (String fieldName : schemaString.split(" ")) {
        StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        fields.add(field);
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows
    JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String record) throws Exception {
            String[] attributes = record.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

    // Creates a temporary view using the DataFrame
    peopleDataFrame.createOrReplaceTempView("people");

    // SQL can be run over a temporary view created using DataFrames
    Dataset<Row> results = spark.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by field index or by field name.
    Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
}
From source file:com.cambitc.spark.streaming.KafkaDirectStreamGrouping.java
License:Apache License
public static void main(String[] args) {
    if (args.length < 2) {
        System.err.println("Usage: KafkaDirectStream <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n"
                + "  KafkaDirectStream localhost:9092 OBDTopics");
        System.exit(1);
    }

    // StreamingExamples.setStreamingLogLevels();
    String brokers = args[0];
    String topics = args[1];

    // Create context with a 10 second batch interval
    // SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
    JavaSparkContext sparkConf = new JavaSparkContext("local[5]", "JavaDirectKafkaWordCount");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    SQLContext sqlContext = new SQLContext(sparkConf);

    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", brokers);
    kafkaParams.put("zookeeper.connect", "localhost:2181");
    kafkaParams.put("group.id", "spark-app");
    System.out.println("Kafka parameters: " + kafkaParams);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("auctionid", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("bid", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("bidtime", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("bidder", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("bidderrate", DataTypes.IntegerType, true));
    fields.add(DataTypes.createStructField("openbid", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("price", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("item", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("daystolive", DataTypes.IntegerType, true));
    StructType schema = DataTypes.createStructType(fields);

    // Get the lines, split them into words
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            System.out.println("*************MY OUTPUT: processing lines: tuple2._1() = " + tuple2._1()
                    + "; tuple2._2()=" + tuple2._2());
            return tuple2._2();
        }
    });
    lines.print();

    // Creating the Data Frame
    DataFrame dFrame = sqlContext.createDataFrame(lines, schema);

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Arrays.asList(SPACE.split(x));
        }
    });
    // words.print();

    // Reduce function adding two integers, defined separately for clarity
    Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    };

    // Count each word in each batch
    JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });

    /*
    JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });
    */

    // Reduce last 30 seconds of data, every 10 seconds
    JavaPairDStream<String, Integer> windowedWordCounts = pairs.reduceByKeyAndWindow(reduceFunc,
            Durations.seconds(30), Durations.seconds(10));
    windowedWordCounts.print();

    // Start the computation
    jssc.start();
    jssc.awaitTermination();
}
From source file:com.estonteco.spark.frames.conf.factory.serializers.StructTypeJAXBAdapter.java
@Override
public StructType unmarshal(ListColumns list) throws Exception {
    List<StructField> structFields = new ArrayList<StructField>();
    for (ColumnInfo field : list.getColumns()) {
        DataType dt = DataType.fromCaseClassString(field.getType());
        StructField columnType = DataTypes.createStructField(field.getName(), dt, field.isNullable());
        structFields.add(columnType);
    }
    return DataTypes.createStructType(structFields);
}
From source file:com.getcake.sparkjdbc.SparkJDBCServer.java
License:Apache License
private String loadSingleFileWithMeta(String registerTableName, String fullPathTableName, String metaFileName)
        throws IOException {
    DataFrame dynamicDataFrame;
    long startTime, firstStartTime;
    float durSeconds, durMinutes;
    String respMsg;

    startTime = System.currentTimeMillis();
    firstStartTime = startTime;
    try {
        dynamicDataFrame = hiveContext.table(registerTableName);
        respMsg = "table " + registerTableName + " at " + fullPathTableName + " was already loaded";
        log(respMsg);
        return respMsg;
    } catch (Throwable exc) {
        // hiveContext.table does not declare that it throws NoSuchTableException, so it cannot be used
        // in a catch clause and has to be checked for explicitly
        if (exc instanceof NoSuchTableException) {
            respMsg = "table " + registerTableName + " at " + fullPathTableName
                    + " was not loaded => load it next";
            log(respMsg);
        } else {
            throw exc;
        }
    }

    FileInputStream propFileInputStream;
    propFileInputStream = new FileInputStream(metaFileName);
    properties = new Properties();
    properties.load(propFileInputStream);
    Stream<Entry<Object, Object>> stream = properties.entrySet().stream();
    Map<String, String> options = stream.collect(Collectors.toMap(entry -> String.valueOf(entry.getKey()),
            entry -> String.valueOf(entry.getValue())));

    int numColumns = Integer.parseInt(properties.getProperty("numColumns"));
    StructField structFields[] = new StructField[numColumns];
    String colName, colType;
    StructField structField;
    for (int i = 1; i <= numColumns; i++) {
        colName = properties.getProperty("col" + i + ".name");
        colType = properties.getProperty("col" + i + ".type");
        switch (colType) {
        case "TimeStamp":
            structField = DataTypes.createStructField(colName, DataTypes.TimestampType, true);
            break;
        case "Date":
            structField = DataTypes.createStructField(colName, DataTypes.DateType, true);
            break;
        case "Float":
            structField = DataTypes.createStructField(colName, DataTypes.FloatType, true);
            break;
        case "Integer":
            structField = DataTypes.createStructField(colName, DataTypes.IntegerType, true);
            break;
        case "Long":
            structField = DataTypes.createStructField(colName, DataTypes.LongType, true);
            break;
        case "Short":
            structField = DataTypes.createStructField(colName, DataTypes.ShortType, true);
            break;
        case "Double":
            structField = DataTypes.createStructField(colName, DataTypes.DoubleType, true);
            break;
        case "Boolean":
            structField = DataTypes.createStructField(colName, DataTypes.BooleanType, true);
            break;
        case "Binary":
            structField = DataTypes.createStructField(colName, DataTypes.BinaryType, true);
            break;
        case "Byte":
            structField = DataTypes.createStructField(colName, DataTypes.ByteType, true);
            break;
        case "Null":
            structField = DataTypes.createStructField(colName, DataTypes.NullType, true);
            break;
        default:
            structField = DataTypes.createStructField(colName, DataTypes.StringType, true);
        }
        structFields[i - 1] = structField;
    }

    // dynamicDataFrame = hiveContext.read().format("com.databricks.spark.csv")
    //         .option("header", Boolean.toString(headerInCSVFileFlag))
    //         .option("inferSchema", Boolean.toString(inferSchemaFlag)).load(fullPathTableName);
    // Map<String, String> options = new HashMap<>(properties);
    options.put("path", "file:///" + fullPathTableName);
    // options.put("header", "false");
    // options.put("delimiter", ",");
    // DataType dataType = new DataType();
    /*
    StructField structField1 = DataTypes.createStructField("LogType", DataTypes.StringType, false);
    StructField structField2 = DataTypes.createStructField("EntryTime", DataTypes.TimestampType, false);
    StructField structField3 = DataTypes.createStructField("Code_Class", DataTypes.StringType, false);
    StructField structField4 = DataTypes.createStructField("Code_Method", DataTypes.StringType, false);
    StructField structField5 = DataTypes.createStructField("Log_Message", DataTypes.StringType, false);
    structFields[0] = structField1;
    structFields[1] = structField2;
    structFields[2] = structField3;
    structFields[3] = structField4;
    structFields[4] = structField5;
    */

    StructType schema = new StructType(structFields);
    dynamicDataFrame = hiveContext.load("com.databricks.spark.csv", schema, options);

    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("loaded table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    schema = dynamicDataFrame.schema();
    structFields = schema.fields();
    for (StructField structFieldLocal : structFields) {
        DataType dataType = structFieldLocal.dataType();
        logger.debug(structFieldLocal.name() + " - dataType: " + dataType.typeName());
    }

    startTime = System.currentTimeMillis();
    dynamicDataFrame.cache();
    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("cache table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    startTime = System.currentTimeMillis();
    dynamicDataFrame.registerTempTable(registerTableName);
    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("registerTempTable table " + registerTableName + " in seconds: " + durSeconds + " / in minutes: "
            + durMinutes);

    durSeconds = (float) (System.currentTimeMillis() - firstStartTime) / 1000F;
    durMinutes = durSeconds / 60F;
    respMsg = "Completed loading table " + fullPathTableName + " in seconds: " + durSeconds
            + " / in minutes: " + durMinutes;
    log(respMsg);
    return respMsg;
}
From source file:com.getcake.sparkjdbc.SparkJDBCServer.java
License:Apache License
private String loadFilesWithMeta(String registerTableName, String fullPathTableName, String metaFileName,
        String fileListName) throws IOException {
    DataFrame combinedDynamicDataFrame = null, dynamicDataFrame = null;
    long startTime, firstStartTime;
    float durSeconds, durMinutes;
    String respMsg;

    startTime = System.currentTimeMillis();
    firstStartTime = startTime;
    try {
        combinedDynamicDataFrame = hiveContext.table(registerTableName);
        respMsg = "table " + registerTableName + " at " + fullPathTableName + " was already loaded";
        log(respMsg);
        return respMsg;
    } catch (Throwable exc) {
        // hiveContext.table does not declare that it throws NoSuchTableException, so it cannot be used
        // in a catch clause and has to be checked for explicitly
        if (exc instanceof NoSuchTableException) {
            respMsg = "table " + registerTableName + " at " + fullPathTableName
                    + " was not loaded => load it next";
            log(respMsg);
        } else {
            throw exc;
        }
    }

    FileInputStream propFileInputStream;
    propFileInputStream = new FileInputStream(metaFileName);
    properties = new Properties();
    properties.load(propFileInputStream);
    Stream<Entry<Object, Object>> stream = properties.entrySet().stream();
    Map<String, String> options = stream.collect(Collectors.toMap(entry -> String.valueOf(entry.getKey()),
            entry -> String.valueOf(entry.getValue())));

    int numColumns = Integer.parseInt(properties.getProperty("numColumns"));
    StructField structFields[] = new StructField[numColumns];
    String colName, colType;
    StructField structField;
    // structField = DataTypes.createStructField("File_Source", DataTypes.StringType, true);
    // structFields[0] = structField;
    for (int i = 1; i <= numColumns; i++) {
        colName = properties.getProperty("col" + i + ".name");
        colType = properties.getProperty("col" + i + ".type");
        switch (colType) {
        case "TimeStamp":
            structField = DataTypes.createStructField(colName, DataTypes.TimestampType, true);
            break;
        case "Date":
            structField = DataTypes.createStructField(colName, DataTypes.DateType, true);
            break;
        case "Float":
            structField = DataTypes.createStructField(colName, DataTypes.FloatType, true);
            break;
        case "Integer":
            structField = DataTypes.createStructField(colName, DataTypes.IntegerType, true);
            break;
        case "Long":
            structField = DataTypes.createStructField(colName, DataTypes.LongType, true);
            break;
        case "Short":
            structField = DataTypes.createStructField(colName, DataTypes.ShortType, true);
            break;
        case "Double":
            structField = DataTypes.createStructField(colName, DataTypes.DoubleType, true);
            break;
        case "Boolean":
            structField = DataTypes.createStructField(colName, DataTypes.BooleanType, true);
            break;
        case "Binary":
            structField = DataTypes.createStructField(colName, DataTypes.BinaryType, true);
            break;
        case "Byte":
            structField = DataTypes.createStructField(colName, DataTypes.ByteType, true);
            break;
        case "Null":
            structField = DataTypes.createStructField(colName, DataTypes.NullType, true);
            break;
        default:
            structField = DataTypes.createStructField(colName, DataTypes.StringType, true);
        }
        structFields[i - 1] = structField;
    }

    StructType schema = new StructType(structFields);

    List<String> fileLlist = new ArrayList<>();
    try (BufferedReader br = Files.newBufferedReader(Paths.get(fileListName))) {
        // br returns a stream; convert it into a List
        fileLlist = br.lines().collect(Collectors.toList());
    } catch (IOException e) {
        e.printStackTrace();
    }

    for (String file : fileLlist) {
        options.put("path", "file:///" + file);
        dynamicDataFrame = hiveContext.load("com.databricks.spark.csv", schema, options);
        if (combinedDynamicDataFrame == null) {
            combinedDynamicDataFrame = dynamicDataFrame;
        } else {
            combinedDynamicDataFrame = combinedDynamicDataFrame.unionAll(dynamicDataFrame);
        }
    }

    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("loaded table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    schema = combinedDynamicDataFrame.schema();
    structFields = schema.fields();
    for (StructField structFieldLocal : structFields) {
        DataType dataType = structFieldLocal.dataType();
        logger.debug(structFieldLocal.name() + " - dataType: " + dataType.typeName());
    }

    startTime = System.currentTimeMillis();
    combinedDynamicDataFrame.cache();
    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("cache table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    startTime = System.currentTimeMillis();
    combinedDynamicDataFrame.registerTempTable(registerTableName);
    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("registerTempTable table " + registerTableName + " in seconds: " + durSeconds + " / in minutes: "
            + durMinutes);

    durSeconds = (float) (System.currentTimeMillis() - firstStartTime) / 1000F;
    durMinutes = durSeconds / 60F;
    respMsg = "Completed loading table " + fullPathTableName + " in seconds: " + durSeconds
            + " / in minutes: " + durMinutes;
    log(respMsg);
    return respMsg;
}
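The two SparkJDBCServer examples above repeat the same colType-to-DataType switch. As an aside, a hedged sketch of how that mapping could be factored into a small helper; the method name and placement are hypothetical, not part of the original source, and it assumes the same DataTypes/StructField imports the examples already use:

// Hypothetical helper; the original SparkJDBCServer code inlines this switch in both load methods.
private static StructField structFieldFor(String colName, String colType) {
    switch (colType) {
    case "TimeStamp": return DataTypes.createStructField(colName, DataTypes.TimestampType, true);
    case "Date":      return DataTypes.createStructField(colName, DataTypes.DateType, true);
    case "Float":     return DataTypes.createStructField(colName, DataTypes.FloatType, true);
    case "Integer":   return DataTypes.createStructField(colName, DataTypes.IntegerType, true);
    case "Long":      return DataTypes.createStructField(colName, DataTypes.LongType, true);
    case "Short":     return DataTypes.createStructField(colName, DataTypes.ShortType, true);
    case "Double":    return DataTypes.createStructField(colName, DataTypes.DoubleType, true);
    case "Boolean":   return DataTypes.createStructField(colName, DataTypes.BooleanType, true);
    case "Binary":    return DataTypes.createStructField(colName, DataTypes.BinaryType, true);
    case "Byte":      return DataTypes.createStructField(colName, DataTypes.ByteType, true);
    case "Null":      return DataTypes.createStructField(colName, DataTypes.NullType, true);
    default:          return DataTypes.createStructField(colName, DataTypes.StringType, true);
    }
}

With such a helper, each loop body would reduce to structFields[i - 1] = structFieldFor(colName, colType);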
From source file:com.hxr.bigdata.spark.example141.JavaSparkSQL.java
License:Apache License
public static void main(final String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(ctx);

    System.out.println("=== Data source: RDD ===");
    // Load a text file and convert each line to a Java Bean,
    // e.g. hdfs://127.0.0.1:9000/spark/people.txt
    JavaRDD<Person> people = ctx.textFile("/spark/people.txt").map(new Function<String, Person>() {
        public Person call(final String line) {
            String[] parts = line.split(",");
            Person person = new Person();
            person.setName(parts[0]);
            person.setAge(Integer.parseInt(parts[1].trim()));
            return person;
        }
    });

    // Apply a schema to an RDD of Java Beans and register it as a table.
    DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
    schemaPeople.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    // ------------------------ Programmatic schema (without a JavaBean) --------------------------
    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<StructField>();
    for (String fieldName : schemaString.split(" ")) {
        fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
    }
    StructType schema = DataTypes.createStructType(fields);

    // Load a text file.
    JavaRDD<String> peopleT = ctx.textFile("/spark/people.txt");

    // Convert records of the RDD (people) to Rows.
    JavaRDD<Row> rowRDD = peopleT.map(new Function<String, Row>() {
        public Row call(final String record) throws Exception {
            String[] fields = record.split(",");
            return RowFactory.create(fields[0], fields[1].trim());
        }
    });

    // Apply the schema to the RDD.
    DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);

    // Register the DataFrame as a table.
    peopleDataFrame.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame results = sqlContext.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> names = results.javaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();

    System.out.println("=== Data source: Parquet File ===");
    // DataFrames can be saved as parquet files, maintaining the schema information,
    // e.g. hdfs://127.0.0.1:9000/user/hanxirui/people.parquet
    // SaveMode.ErrorIfExists (default): when saving a DataFrame to a data source, if data already exists,
    //   an exception is expected to be thrown.
    // SaveMode.Append: when saving a DataFrame to a data source, if data/table already exists, contents of
    //   the DataFrame are expected to be appended to existing data.
    // SaveMode.Overwrite: when saving a DataFrame to a data source, if data/table already exists, existing
    //   data is expected to be overwritten by the contents of the DataFrame.
    // SaveMode.Ignore: when saving a DataFrame to a data source, if data already exists, the save operation
    //   is expected to not save the contents of the DataFrame and to not change the existing data.
    //   This is similar to a CREATE TABLE IF NOT EXISTS in SQL.
    schemaPeople.write().mode(SaveMode.Ignore).parquet("people.parquet");

    // Read in the parquet file created above.
    // Parquet files are self-describing so the schema is preserved.
    // The result of loading a parquet file is also a DataFrame.
    DataFrame parquetFile = sqlContext.read().parquet("people.parquet");

    // Parquet files can also be registered as tables and then used in SQL statements.
    parquetFile.registerTempTable("parquetFile");
    DataFrame teenagers2 = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
    teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    System.out.println("=== Data source: JSON Dataset ===");
    // A JSON dataset is pointed to by path.
    // The path can be either a single text file or a directory storing text files.
    String path = "/spark/people.json";

    // Create a DataFrame from the file(s) pointed to by path
    DataFrame peopleFromJsonFile = sqlContext.read().json(path);

    // Because the schema of a JSON dataset is automatically inferred, to write queries
    // it is better to take a look at what the schema is.
    peopleFromJsonFile.printSchema();
    // The schema of people is ...
    // root
    //  |-- age: IntegerType
    //  |-- name: StringType

    // Register this DataFrame as a table.
    peopleFromJsonFile.registerTempTable("people");

    // SQL statements can be run by using the sql methods provided by sqlContext.
    DataFrame teenagers3 = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays
            .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
    DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());

    // Take a look at the schema of this new DataFrame.
    peopleFromJsonRDD.printSchema();
    // The schema of anotherPeople is ...
    // root
    //  |-- address: StructType
    //  |    |-- city: StringType
    //  |    |-- state: StringType
    //  |-- name: StringType

    peopleFromJsonRDD.registerTempTable("people2");
    DataFrame peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
    List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0) + ", City: " + row.getString(1);
        }
    }).collect();
    for (String name : nameAndCity) {
        System.out.println(name);
    }

    ctx.stop();
}
From source file:com.ibm.bi.dml.api.MLOutput.java
License:Open Source License
/**
 * This method improves the performance of MLPipeline wrappers.
 * @param sqlContext
 * @param varName
 * @param range range is inclusive
 * @return
 * @throws DMLRuntimeException
 */
public DataFrame getDF(SQLContext sqlContext, String varName, HashMap<String, Tuple2<Long, Long>> range)
        throws DMLRuntimeException {
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD = getBinaryBlockedRDD(varName);
    if (binaryBlockRDD == null) {
        throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table.");
    }
    MatrixCharacteristics mc = _outMetadata.get(varName);
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    ArrayList<Tuple2<String, Tuple2<Long, Long>>> alRange = new ArrayList<Tuple2<String, Tuple2<Long, Long>>>();
    for (Entry<String, Tuple2<Long, Long>> e : range.entrySet()) {
        alRange.add(new Tuple2<String, Tuple2<Long, Long>>(e.getKey(), e.getValue()));
    }

    // Very expensive operation here: groupByKey (where the number of keys might be too large)
    JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen))
            .groupByKey()
            .map(new ConvertDoubleArrayToRangeRows(clen, bclen, alRange));

    int numColumns = (int) clen;
    if (numColumns <= 0) {
        throw new DMLRuntimeException(
                "Output dimensions unknown after executing the script and hence cannot create the dataframe");
    }

    List<StructField> fields = new ArrayList<StructField>();
    // LongType throws an error: java.lang.Double incompatible with java.lang.Long
    fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false));
    for (int k = 0; k < alRange.size(); k++) {
        String colName = alRange.get(k)._1;
        long low = alRange.get(k)._2._1;
        long high = alRange.get(k)._2._2;
        if (low != high)
            fields.add(DataTypes.createStructField(colName, new VectorUDT(), false));
        else
            fields.add(DataTypes.createStructField(colName, DataTypes.DoubleType, false));
    }

    // This would cause infinite recursion due to a bug in Spark:
    // https://issues.apache.org/jira/browse/SPARK-6999
    // return sqlContext.createDataFrame(rowsRDD, colNames); // where colNames is an ArrayList<String>
    return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}