List of usage examples for org.apache.spark.sql.types DataTypes createStructType
public static StructType createStructType(StructField[] fields)
From source file:KafkaSparkMongo.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n" + " <brokers> is a list of one or more Kafka brokers\n" + " <topics> is a list of one or more kafka topics to consume from\n\n"); System.exit(1);//from w ww.ja v a2s . c o m } String brokers = args[0]; String topics = args[1]; String UriMongo = "mongodb://localhost/streamSparkFinal.coll"; dropDatabase(UriMongo); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount") .set("spark.app.id", "MongoSparkConnectorTour").set("spark.mongodb.input.uri", UriMongo) .set("spark.mongodb.output.uri", UriMongo); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5)); /** Create a JavaReceiverInputDStream on target ip:port and count the * words in input stream of \n delimited text (eg. generated by 'nc') * Note that no duplication in storage level only for running locally. * Replication necessary in distributed scenario for fault tolerance. */ Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(","))); Map<String, String> kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", brokers); // Create direct kafka stream with brokers and topics JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); messages.print(); JavaDStream<String> lines = messages.map(x -> x._2()); JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> { String[] wordy = SPACE.split(y); return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]); }); words.foreachRDD(rdd -> { List<StructField> subFields = new ArrayList<>(); subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true)); subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true)); subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true)); List<StructField> fields = new ArrayList<>(); fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true)); fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true)); StructType schema = DataTypes.createStructType(fields); SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf()); JavaRDD<Row> rowRDD = rdd .map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(), RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()), Double.parseDouble(palabra._6())), Timestamp.from(Instant.parse(palabra._7())))); Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema); wordsDataFrame.show(); MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save(); }); ssc.start(); ssc.awaitTermination(); }
From source file:com.alpine.plugin.samples.ver1_0.JavaCountPlugin.CountPluginSparkJob.java
License:Open Source License
public DataFrame transform(OperatorParameters params, DataFrame inputDataFrame, SparkRuntimeUtils sparkUtils, OperatorListener listener) {/*from ww w .j a v a 2 s . c o m*/ String groupByVar = params.getTabularDatasetSelectedColumn(GroupByParamKey)._2(); listener.notifyMessage("Starting the DataFrame Transformation"); DataFrame selectedData = inputDataFrame.select(groupByVar); DataFrame df = selectedData.groupBy(groupByVar).count(); //customize the output schema List<StructField> fields = new ArrayList<StructField>(); fields.add(DataTypes.createStructField(groupByVar, DataTypes.StringType, true)); fields.add(DataTypes.createStructField("GroupCount", DataTypes.LongType, true)); StructType dfSchema = DataTypes.createStructType(fields); return inputDataFrame.sqlContext().createDataFrame(df.rdd(), dfSchema); }
From source file:com.andado.spark.examples.ml.JavaElementwiseProductExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaElementwiseProductExample").getOrCreate(); // $example on$ // Create some vector data; also works for sparse vectors List<Row> data = Arrays.asList(RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)), RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))); List<StructField> fields = new ArrayList<>(2); fields.add(DataTypes.createStructField("id", DataTypes.StringType, false)); fields.add(DataTypes.createStructField("vector", new VectorUDT(), false)); StructType schema = DataTypes.createStructType(fields); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); ElementwiseProduct transformer = new ElementwiseProduct().setScalingVec(transformingVector) .setInputCol("vector").setOutputCol("transformedVector"); // Batch transform the vectors to create new column: transformer.transform(dataFrame).show(); // $example off$ spark.stop();/* w w w. j av a2s . c o m*/ }
From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java
License:Apache License
private static void runProgrammaticSchemaExample(SparkSession spark) { // $example on:programmatic_schema$ // Create an RDD JavaRDD<String> peopleRDD = spark.sparkContext().textFile("examples/src/main/resources/people.txt", 1) .toJavaRDD();/*from w ww . ja va2 s . c o m*/ // The schema is encoded in a string String schemaString = "name age"; // Generate the schema based on the string of schema List<StructField> fields = new ArrayList<>(); for (String fieldName : schemaString.split(" ")) { StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true); fields.add(field); } StructType schema = DataTypes.createStructType(fields); // Convert records of the RDD (people) to Rows JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() { @Override public Row call(String record) throws Exception { String[] attributes = record.split(","); return RowFactory.create(attributes[0], attributes[1].trim()); } }); // Apply the schema to the RDD Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema); // Creates a temporary view using the DataFrame peopleDataFrame.createOrReplaceTempView("people"); // SQL can be run over a temporary view created using DataFrames Dataset<Row> results = spark.sql("SELECT name FROM people"); // The results of SQL queries are DataFrames and support all the normal RDD operations // The columns of a row in the result can be accessed by field index or by field name Dataset<String> namesDS = results.map(new MapFunction<Row, String>() { @Override public String call(Row row) throws Exception { return "Name: " + row.getString(0); } }, Encoders.STRING()); namesDS.show(); // +-------------+ // | value| // +-------------+ // |Name: Michael| // | Name: Andy| // | Name: Justin| // +-------------+ // $example off:programmatic_schema$ }
From source file:com.cambitc.spark.streaming.KafkaDirectStreamGrouping.java
License:Apache License
public static void main(String[] args) { if (args.length < 2) { System.err.println("Usage: KafkaDirectStream <brokers> <topics>\n" + " <brokers> is a list of one or more Kafka brokers\n" + " <topics> is a list of one or more kafka topics to consume from\n\n" + " KafkaDirectStream localhost:9092 OBDTopics"); System.exit(1);// w w w .ja va 2 s .co m } //StreamingExamples.setStreamingLogLevels(); String brokers = args[0]; String topics = args[1]; // Create context with a 2 seconds batch interval //SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount"); JavaSparkContext sparkConf = new JavaSparkContext("local[5]", "JavaDirectKafkaWordCount"); JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(10)); SQLContext sqlContext = new SQLContext(sparkConf); HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(","))); HashMap<String, String> kafkaParams = new HashMap<String, String>(); kafkaParams.put("metadata.broker.list", brokers); kafkaParams.put("zookeeper.connect", "localhost:2181"); kafkaParams.put("group.id", "spark-app"); System.out.println("Kafka parameters: " + kafkaParams); // Create direct kafka stream with brokers and topics JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); // Generate the schema based on the string of schema List<StructField> fields = new ArrayList<StructField>(); fields.add(DataTypes.createStructField("auctionid", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("bid", DataTypes.FloatType, true)); fields.add(DataTypes.createStructField("bidtime", DataTypes.FloatType, true)); fields.add(DataTypes.createStructField("bidder", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("bidderrate", DataTypes.IntegerType, true)); fields.add(DataTypes.createStructField("openbid", DataTypes.FloatType, true)); fields.add(DataTypes.createStructField("price", DataTypes.FloatType, true)); fields.add(DataTypes.createStructField("item", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("daystolive", DataTypes.IntegerType, true)); StructType schema = DataTypes.createStructType(fields); // Get the lines, split them into words JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() { @Override public String call(Tuple2<String, String> tuple2) { System.out.println("*************MY OUTPUT: processing lines: tuple2._1() = " + tuple2._1() + "; tuple2._2()=" + tuple2._2()); return tuple2._2(); } }); lines.print(); //Creating Data Frame DataFrame dFrame = sqlContext.createDataFrame(lines, schema); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String x) { return Arrays.asList(SPACE.split(x)); } }); //words.print(); // Reduce function adding two integers, defined separately for clarity Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }; // Count each word in each batch JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); /* JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); /* */ // Reduce last 30 seconds of data, every 10 seconds JavaPairDStream<String, Integer> windowedWordCounts = pairs.reduceByKeyAndWindow(reduceFunc, Durations.seconds(30), Durations.seconds(10)); windowedWordCounts.print(); // Start the computation jssc.start(); jssc.awaitTermination(); }
From source file:com.estonteco.spark.frames.conf.factory.serializers.StructTypeJAXBAdapter.java
@Override public StructType unmarshal(ListColumns list) throws Exception { List<StructField> structFields = new ArrayList<StructField>(); for (ColumnInfo field : list.getColumns()) { DataType dt = DataType.fromCaseClassString(field.getType()); StructField columnType = DataTypes.createStructField(field.getName(), dt, field.isNullable()); structFields.add(columnType);//from w ww . j a va 2 s. com } return DataTypes.createStructType(structFields); }
From source file:com.hxr.bigdata.spark.example141.JavaSparkSQL.java
License:Apache License
public static void main(final String[] args) throws Exception { SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); SQLContext sqlContext = new SQLContext(ctx); System.out.println("=== Data source: RDD ==="); // Load a text file and convert each line to a Java Bean. // ?javabean? // hdfs://127.0.0.1:9000/spark/people.txt JavaRDD<Person> people = ctx.textFile("/spark/people.txt").map(new Function<String, Person>() { public Person call(final String line) { String[] parts = line.split(","); Person person = new Person(); person.setName(parts[0]);// w ww.jav a2s . c o m person.setAge(Integer.parseInt(parts[1].trim())); return person; } }); // Apply a schema to an RDD of Java Beans and register it as a table. // schema?javabeanRDD DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. // ??sql DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. // DataFrame?RDD?RDD? List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); for (String name : teenagerNames) { System.out.println(name); } // ------------------------??javabean-------------------------- // The schema is encoded in a string String schemaString = "name age"; // Generate the schema based on the string of schema List<StructField> fields = new ArrayList<StructField>(); for (String fieldName : schemaString.split(" ")) { fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true)); } StructType schema = DataTypes.createStructType(fields); // Load a text file and convert each line to a JavaBean. JavaRDD<String> peopleT = ctx.textFile("/spark/people.txt"); // Convert records of the RDD (people) to Rows. JavaRDD<Row> rowRDD = peopleT.map(new Function<String, Row>() { public Row call(final String record) throws Exception { String[] fields = record.split(","); return RowFactory.create(fields[0], fields[1].trim()); } }); // Apply the schema to the RDD. DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema); // Register the DataFrame as a table. peopleDataFrame.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. DataFrame results = sqlContext.sql("SELECT name FROM people"); // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. List<String> names = results.javaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); System.out.println("=== Data source: Parquet File ==="); // DataFrames can be saved as parquet files, maintaining the schema information. // hdfs??hdfs://127.0.0.1:9000/user/hanxirui/people.parquet // SaveMode.ErrorIfExists (default) When saving a DataFrame to a data source, if data already exists, an exception is expected to be thrown. // SaveMode.Append When saving a DataFrame to a data source, if data/table already exists, contents of the DataFrame are expected to be appended to existing data. // SaveMode.Overwrite Overwrite mode means that when saving a DataFrame to a data source, if data/table already exists, existing data is expected to be overwritten by the contents of the DataFrame. // SaveMode.Ignore Ignore mode means that when saving a DataFrame to a data source, if data already exists, the save operation is expected to not save the contents of the DataFrame and to not change the existing data. This is similar to a CREATE TABLE IF NOT EXISTS in SQL. schemaPeople.write().mode(SaveMode.Ignore).parquet("people.parquet"); // Read in the parquet file created above. // Parquet files are self-describing so the schema is preserved. // The result of loading a parquet file is also a DataFrame. DataFrame parquetFile = sqlContext.read().parquet("people.parquet"); // Parquet files can also be registered as tables and then used in SQL statements. parquetFile.registerTempTable("parquetFile"); DataFrame teenagers2 = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); for (String name : teenagerNames) { System.out.println(name); } System.out.println("=== Data source: JSON Dataset ==="); // A JSON dataset is pointed by path. // The path can be either a single text file or a directory storing text files. String path = "/spark/people.json"; // Create a DataFrame from the file(s) pointed by path DataFrame peopleFromJsonFile = sqlContext.read().json(path); // Because the schema of a JSON dataset is automatically inferred, to write queries, // it is better to take a look at what is the schema. peopleFromJsonFile.printSchema(); // The schema of people is ... // root // |-- age: IntegerType // |-- name: StringType // Register this DataFrame as a table. peopleFromJsonFile.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlContext. DataFrame teenagers3 = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrame and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0); } }).collect(); for (String name : teenagerNames) { System.out.println(name); } // Alternatively, a DataFrame can be created for a JSON dataset represented by // a RDD[String] storing one JSON object per string. List<String> jsonData = Arrays .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData); DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd()); // Take a look at the schema of this new DataFrame. peopleFromJsonRDD.printSchema(); // The schema of anotherPeople is ... // root // |-- address: StructType // | |-- city: StringType // | |-- state: StringType // |-- name: StringType peopleFromJsonRDD.registerTempTable("people2"); DataFrame peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2"); List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() { public String call(final Row row) { return "Name: " + row.getString(0) + ", City: " + row.getString(1); } }).collect(); for (String name : nameAndCity) { System.out.println(name); } ctx.stop(); }
From source file:com.ibm.bi.dml.api.MLOutput.java
License:Open Source License
/** * This methods improves the performance of MLPipeline wrappers. * @param sqlContext//from ww w .jav a2s . co m * @param varName * @param range range is inclusive * @return * @throws DMLRuntimeException */ public DataFrame getDF(SQLContext sqlContext, String varName, HashMap<String, Tuple2<Long, Long>> range) throws DMLRuntimeException { JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD = getBinaryBlockedRDD(varName); if (binaryBlockRDD == null) { throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } MatrixCharacteristics mc = _outMetadata.get(varName); long rlen = mc.getRows(); long clen = mc.getCols(); int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock(); ArrayList<Tuple2<String, Tuple2<Long, Long>>> alRange = new ArrayList<Tuple2<String, Tuple2<Long, Long>>>(); for (Entry<String, Tuple2<Long, Long>> e : range.entrySet()) { alRange.add(new Tuple2<String, Tuple2<Long, Long>>(e.getKey(), e.getValue())); } // Very expensive operation here: groupByKey (where number of keys might be too large) JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen)).groupByKey() .map(new ConvertDoubleArrayToRangeRows(clen, bclen, alRange)); int numColumns = (int) clen; if (numColumns <= 0) { throw new DMLRuntimeException( "Output dimensions unknown after executing the script and hence cannot create the dataframe"); } List<StructField> fields = new ArrayList<StructField>(); // LongTypes throw an error: java.lang.Double incompatible with java.lang.Long fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false)); for (int k = 0; k < alRange.size(); k++) { String colName = alRange.get(k)._1; long low = alRange.get(k)._2._1; long high = alRange.get(k)._2._2; if (low != high) fields.add(DataTypes.createStructField(colName, new VectorUDT(), false)); else fields.add(DataTypes.createStructField(colName, DataTypes.DoubleType, false)); } // This will cause infinite recursion due to bug in Spark // https://issues.apache.org/jira/browse/SPARK-6999 // return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields)); }
From source file:com.ibm.bi.dml.runtime.instructions.spark.utils.RDDConverterUtilsExt.java
License:Open Source License
public static DataFrame binaryBlockToVectorDataFrame(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD, MatrixCharacteristics mc, SQLContext sqlContext) throws DMLRuntimeException { long rlen = mc.getRows(); long clen = mc.getCols(); int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock(); // Very expensive operation here: groupByKey (where number of keys might be too large) JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen)).groupByKey() .map(new ConvertDoubleArrayToRows(clen, bclen, true)); int numColumns = (int) clen; if (numColumns <= 0) { throw new DMLRuntimeException( "Output dimensions unknown after executing the script and hence cannot create the dataframe"); }/* w w w . j a v a 2s. c om*/ List<StructField> fields = new ArrayList<StructField>(); // LongTypes throw an error: java.lang.Double incompatible with java.lang.Long fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false)); fields.add(DataTypes.createStructField("C1", new VectorUDT(), false)); // fields.add(DataTypes.createStructField("C1", DataTypes.createArrayType(DataTypes.DoubleType), false)); // This will cause infinite recursion due to bug in Spark // https://issues.apache.org/jira/browse/SPARK-6999 // return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields)); }
From source file:com.ibm.bi.dml.runtime.instructions.spark.utils.RDDConverterUtilsExt.java
License:Open Source License
public static DataFrame binaryBlockToDataFrame(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD, MatrixCharacteristics mc, SQLContext sqlContext) throws DMLRuntimeException { long rlen = mc.getRows(); long clen = mc.getCols(); int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock(); // Very expensive operation here: groupByKey (where number of keys might be too large) JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen)).groupByKey() .map(new ConvertDoubleArrayToRows(clen, bclen, false)); int numColumns = (int) clen; if (numColumns <= 0) { // numColumns = rowsRDD.first().length() - 1; // Ugly, so instead prefer to throw throw new DMLRuntimeException( "Output dimensions unknown after executing the script and hence cannot create the dataframe"); }//from ww w . j a va 2 s . co m List<StructField> fields = new ArrayList<StructField>(); // LongTypes throw an error: java.lang.Double incompatible with java.lang.Long fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false)); for (int i = 1; i <= numColumns; i++) { fields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false)); } // This will cause infinite recursion due to bug in Spark // https://issues.apache.org/jira/browse/SPARK-6999 // return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields)); }