Example usage for org.apache.spark.sql.types DataTypes createStructField

Introduction

This page lists example usages of org.apache.spark.sql.types.DataTypes.createStructField.

Prototype

public static StructField createStructField(String name, DataType dataType, boolean nullable) 

Document

Creates a StructField with empty metadata.
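
Before the collected examples, here is a minimal, self-contained sketch of the typical pattern (an illustration, not taken from any of the source files below; the local[*] master and the name/age columns are assumptions). Each createStructField call contributes one column, given a name, a DataType and a nullable flag, to a StructType that is then applied to a set of Rows; this overload attaches empty metadata, as noted above.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class CreateStructFieldDemo {
    public static void main(String[] args) {
        // Local session purely for illustration; cluster settings are assumptions.
        SparkSession spark = SparkSession.builder().appName("CreateStructFieldDemo").master("local[*]")
                .getOrCreate();

        // Each createStructField call names a column, sets its DataType,
        // and declares whether null values are allowed.
        List<StructField> fields = Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType schema = DataTypes.createStructType(fields);

        List<Row> rows = Arrays.asList(RowFactory.create("Michael", 29), RowFactory.create("Andy", null));

        // Apply the programmatic schema to the rows.
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema();
        df.show();

        spark.stop();
    }
}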

Usage

From source file:KafkaSparkMongo.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n");
        System.exit(1);
    }

    String brokers = args[0];
    String topics = args[1];

    String UriMongo = "mongodb://localhost/streamSparkFinal.coll";
    dropDatabase(UriMongo);

    // Create the streaming context with a 5 second batch interval
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount")
            .set("spark.app.id", "MongoSparkConnectorTour").set("spark.mongodb.input.uri", UriMongo)
            .set("spark.mongodb.output.uri", UriMongo);

    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));
    /*
     * Create a direct Kafka stream for the given brokers and topics; each record
     * value is parsed into a Tuple7 below and the result is written to MongoDB.
     */

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    messages.print();

    JavaDStream<String> lines = messages.map(x -> x._2());

    JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> {
        String[] wordy = SPACE.split(y);
        return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]);
    });

    words.foreachRDD(rdd -> {

        List<StructField> subFields = new ArrayList<>();
        subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true));

        List<StructField> fields = new ArrayList<>();
        fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true));
        fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true));

        StructType schema = DataTypes.createStructType(fields);

        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

        JavaRDD<Row> rowRDD = rdd
                .map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(),
                        RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()),
                                Double.parseDouble(palabra._6())),
                        Timestamp.from(Instant.parse(palabra._7()))));

        Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema);
        wordsDataFrame.show();
        MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save();
    });

    ssc.start();
    ssc.awaitTermination();
}

From source file:com.alpine.plugin.samples.ver1_0.JavaCountPlugin.CountPluginSparkJob.java

License:Open Source License

public DataFrame transform(OperatorParameters params, DataFrame inputDataFrame, SparkRuntimeUtils sparkUtils,
        OperatorListener listener) {
    String groupByVar = params.getTabularDatasetSelectedColumn(GroupByParamKey)._2();
    listener.notifyMessage("Starting the DataFrame Transformation");
    DataFrame selectedData = inputDataFrame.select(groupByVar);
    DataFrame df = selectedData.groupBy(groupByVar).count();

    //customize the output schema
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(groupByVar, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("GroupCount", DataTypes.LongType, true));
    StructType dfSchema = DataTypes.createStructType(fields);
    return inputDataFrame.sqlContext().createDataFrame(df.rdd(), dfSchema);
}

From source file:com.andado.spark.examples.ml.JavaElementwiseProductExample.java

License:Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaElementwiseProductExample").getOrCreate();

    // $example on$
    // Create some vector data; also works for sparse vectors
    List<Row> data = Arrays.asList(RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
            RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)));

    List<StructField> fields = new ArrayList<>(2);
    fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));

    StructType schema = DataTypes.createStructType(fields);

    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

    ElementwiseProduct transformer = new ElementwiseProduct().setScalingVec(transformingVector)
            .setInputCol("vector").setOutputCol("transformedVector");

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show();
    // $example off$
    spark.stop();
}

From source file:com.andado.spark.examples.sql.JavaSparkSQLExample.java

License:Apache License

private static void runProgrammaticSchemaExample(SparkSession spark) {
    // $example on:programmatic_schema$
    // Create an RDD
    JavaRDD<String> peopleRDD = spark.sparkContext().textFile("examples/src/main/resources/people.txt", 1)
            .toJavaRDD();

    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<>();
    for (String fieldName : schemaString.split(" ")) {
        StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        fields.add(field);
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows
    JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String record) throws Exception {
            String[] attributes = record.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

    // Creates a temporary view using the DataFrame
    peopleDataFrame.createOrReplaceTempView("people");

    // SQL can be run over a temporary view created using DataFrames
    Dataset<Row> results = spark.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations
    // The columns of a row in the result can be accessed by field index or by field name
    Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
        @Override
        public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
}

From source file:com.cambitc.spark.streaming.KafkaDirectStreamGrouping.java

License:Apache License

public static void main(String[] args) {
    if (args.length < 2) {
        System.err.println("Usage: KafkaDirectStream <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n"
                + " KafkaDirectStream localhost:9092 OBDTopics");
        System.exit(1);
    }

    //StreamingExamples.setStreamingLogLevels();

    String brokers = args[0];
    String topics = args[1];

    // Create context with a 2 seconds batch interval
    //SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
    JavaSparkContext sc = new JavaSparkContext("local[5]", "JavaDirectKafkaWordCount");
    JavaStreamingContext jssc = new JavaStreamingContext(sc, Durations.seconds(10));

    SQLContext sqlContext = new SQLContext(sc);

    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", brokers);
    kafkaParams.put("zookeeper.connect", "localhost:2181");
    kafkaParams.put("group.id", "spark-app");
    System.out.println("Kafka parameters: " + kafkaParams);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("auctionid", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("bid", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("bidtime", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("bidder", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("bidderrate", DataTypes.IntegerType, true));
    fields.add(DataTypes.createStructField("openbid", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("price", DataTypes.FloatType, true));
    fields.add(DataTypes.createStructField("item", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("daystolive", DataTypes.IntegerType, true));

    StructType schema = DataTypes.createStructType(fields);

    // Get the lines, split them into words
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            System.out.println("*************MY OUTPUT: processing lines: tuple2._1() = " + tuple2._1()
                    + "; tuple2._2()=" + tuple2._2());
            return tuple2._2();
        }
    });
    lines.print();

    //Creating Data Frame
    DataFrame dFrame = sqlContext.createDataFrame(lines, schema);

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Arrays.asList(SPACE.split(x));
        }
    });
    //words.print();

    // Reduce function adding two integers, defined separately for clarity
    Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    };

    // Count each word in each batch
    JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    /*
    JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });
    */
    // Reduce last 30 seconds of data, every 10 seconds
    JavaPairDStream<String, Integer> windowedWordCounts = pairs.reduceByKeyAndWindow(reduceFunc,
            Durations.seconds(30), Durations.seconds(10));

    windowedWordCounts.print();

    // Start the computation
    jssc.start();
    jssc.awaitTermination();
}

From source file:com.estonteco.spark.frames.conf.factory.serializers.StructTypeJAXBAdapter.java

@Override
public StructType unmarshal(ListColumns list) throws Exception {
    List<StructField> structFields = new ArrayList<StructField>();
    for (ColumnInfo field : list.getColumns()) {
        DataType dt = DataType.fromCaseClassString(field.getType());
        StructField columnType = DataTypes.createStructField(field.getName(), dt, field.isNullable());
        structFields.add(columnType);
    }
    return DataTypes.createStructType(structFields);
}

From source file:com.getcake.sparkjdbc.SparkJDBCServer.java

License:Apache License

private String loadSingleFileWithMeta(String registerTableName, String fullPathTableName, String metaFileName)
        throws IOException {
    DataFrame dynamicDataFrame;
    long startTime, firstStartTime;
    float durSeconds, durMinutes;
    String respMsg;

    startTime = System.currentTimeMillis();
    firstStartTime = startTime;
    try {
        dynamicDataFrame = hiveContext.table(registerTableName);
        respMsg = "table " + registerTableName + " at " + fullPathTableName + " was already loaded";
        log(respMsg);
        return respMsg;
    } catch (Throwable exc) {
        // hiveContext.table does not declare that it throws NoSuchTableException, so it cannot be
        // caught directly; check the exception type explicitly instead.
        if (exc instanceof NoSuchTableException) {
            respMsg = "table " + registerTableName + " at " + fullPathTableName
                    + " was not loaded => load it next";
            log(respMsg);
        } else {
            throw exc;
        }
    }

    FileInputStream propFileInputStream;
    propFileInputStream = new FileInputStream(metaFileName);
    properties = new Properties();
    properties.load(propFileInputStream);

    Stream<Entry<Object, Object>> stream = properties.entrySet().stream();
    Map<String, String> options = stream.collect(Collectors.toMap(entry -> String.valueOf(entry.getKey()),
            entry -> String.valueOf(entry.getValue())));

    int numColumns = Integer.parseInt(properties.getProperty("numColumns"));
    StructField structFields[] = new StructField[numColumns];
    String colName, colType;
    StructField structField;

    for (int i = 1; i <= numColumns; i++) {
        colName = properties.getProperty("col" + i + ".name");
        colType = properties.getProperty("col" + i + ".type");
        switch (colType) {
        case "TimeStamp":
            structField = DataTypes.createStructField(colName, DataTypes.TimestampType, true);
            break;

        case "Date":
            structField = DataTypes.createStructField(colName, DataTypes.DateType, true);
            break;

        case "Float":
            structField = DataTypes.createStructField(colName, DataTypes.FloatType, true);
            break;

        case "Integer":
            structField = DataTypes.createStructField(colName, DataTypes.IntegerType, true);
            break;

        case "Long":
            structField = DataTypes.createStructField(colName, DataTypes.LongType, true);
            break;

        case "Short":
            structField = DataTypes.createStructField(colName, DataTypes.ShortType, true);
            break;

        case "Double":
            structField = DataTypes.createStructField(colName, DataTypes.DoubleType, true);
            break;

        case "Boolean":
            structField = DataTypes.createStructField(colName, DataTypes.BooleanType, true);
            break;

        case "Binary":
            structField = DataTypes.createStructField(colName, DataTypes.BinaryType, true);
            break;

        case "Byte":
            structField = DataTypes.createStructField(colName, DataTypes.ByteType, true);
            break;

        case "Null":
            structField = DataTypes.createStructField(colName, DataTypes.NullType, true);
            break;

        default:
            structField = DataTypes.createStructField(colName, DataTypes.StringType, true);
        }

        structFields[i - 1] = structField;
    }

    // dynamicDataFrame = hiveContext.read().format("com.databricks.spark.csv").
    //   option("header", Boolean.toString(headerInCSVFileFlag)).option("inferSchema", Boolean.toString(inferSchemaFlag)).load(fullPathTableName);
    // Map<String, String> options = new HashMap<>(properties);
    options.put("path", "file:///" + fullPathTableName);
    // options.put("header", "false");
    // options.put("delimiter", ",");

    // DataType dataType = new DataType ();
    /*
    StructField structField1 = DataTypes.createStructField("LogType", DataTypes.StringType, false);
    StructField structField2 = DataTypes.createStructField("EntryTime", DataTypes.TimestampType, false);
    StructField structField3 = DataTypes.createStructField("Code_Class", DataTypes.StringType, false);
    StructField structField4 = DataTypes.createStructField("Code_Method", DataTypes.StringType, false);
    StructField structField5 = DataTypes.createStructField("Log_Message", DataTypes.StringType, false);
    structFields[0] = structField1;
    structFields[1] = structField2;
    structFields[2] = structField3;
    structFields[3] = structField4;
    structFields[4] = structField5;
    */

    StructType schema = new StructType(structFields);

    dynamicDataFrame = hiveContext.load("com.databricks.spark.csv", schema, options);

    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("loaded table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    schema = dynamicDataFrame.schema();
    structFields = schema.fields();
    for (StructField structFieldLocal : structFields) {
        DataType dataType = structFieldLocal.dataType();
        logger.debug(structFieldLocal.name() + " - dataType: " + dataType.typeName());
    }

    startTime = System.currentTimeMillis();
    dynamicDataFrame.cache();
    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("cache table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    startTime = System.currentTimeMillis();
    dynamicDataFrame.registerTempTable(registerTableName);

    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("registerTempTable table " + registerTableName + " in seconds: " + durSeconds + " / in minutes: "
            + durMinutes);

    durSeconds = (float) (System.currentTimeMillis() - firstStartTime) / 1000F;
    durMinutes = durSeconds / 60F;
    respMsg = "Completed loading table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: "
            + durMinutes;
    log(respMsg);
    return respMsg;
}

From source file:com.getcake.sparkjdbc.SparkJDBCServer.java

License:Apache License

private String loadFilesWithMeta(String registerTableName, String fullPathTableName, String metaFileName,
        String fileListName) throws IOException {
    DataFrame combinedDynamicDataFrame = null, dynamicDataFrame = null;
    long startTime, firstStartTime;
    float durSeconds, durMinutes;
    String respMsg;

    startTime = System.currentTimeMillis();
    firstStartTime = startTime;
    try {
        combinedDynamicDataFrame = hiveContext.table(registerTableName);
        respMsg = "table " + registerTableName + " at " + fullPathTableName + " was already loaded";
        log(respMsg);
        return respMsg;
    } catch (Throwable exc) {
        // hiveContext.table does not declare that it throws NoSuchTableException, so it cannot be
        // caught directly; check the exception type explicitly instead.
        if (exc instanceof NoSuchTableException) {
            respMsg = "table " + registerTableName + " at " + fullPathTableName
                    + " was not loaded => load it next";
            log(respMsg);
        } else {
            throw exc;
        }
    }

    FileInputStream propFileInputStream;
    propFileInputStream = new FileInputStream(metaFileName);
    properties = new Properties();
    properties.load(propFileInputStream);

    Stream<Entry<Object, Object>> stream = properties.entrySet().stream();
    Map<String, String> options = stream.collect(Collectors.toMap(entry -> String.valueOf(entry.getKey()),
            entry -> String.valueOf(entry.getValue())));

    int numColumns = Integer.parseInt(properties.getProperty("numColumns"));
    StructField structFields[] = new StructField[numColumns];
    String colName, colType;
    StructField structField;

    // structField = DataTypes.createStructField("File_Source", DataTypes.StringType, true);
    // structFields[0] = structField;

    for (int i = 1; i <= numColumns; i++) {
        colName = properties.getProperty("col" + i + ".name");
        colType = properties.getProperty("col" + i + ".type");
        switch (colType) {
        case "TimeStamp":
            structField = DataTypes.createStructField(colName, DataTypes.TimestampType, true);
            break;

        case "Date":
            structField = DataTypes.createStructField(colName, DataTypes.DateType, true);
            break;

        case "Float":
            structField = DataTypes.createStructField(colName, DataTypes.FloatType, true);
            break;

        case "Integer":
            structField = DataTypes.createStructField(colName, DataTypes.IntegerType, true);
            break;

        case "Long":
            structField = DataTypes.createStructField(colName, DataTypes.LongType, true);
            break;

        case "Short":
            structField = DataTypes.createStructField(colName, DataTypes.ShortType, true);
            break;

        case "Double":
            structField = DataTypes.createStructField(colName, DataTypes.DoubleType, true);
            break;

        case "Boolean":
            structField = DataTypes.createStructField(colName, DataTypes.BooleanType, true);
            break;

        case "Binary":
            structField = DataTypes.createStructField(colName, DataTypes.BinaryType, true);
            break;

        case "Byte":
            structField = DataTypes.createStructField(colName, DataTypes.ByteType, true);
            break;

        case "Null":
            structField = DataTypes.createStructField(colName, DataTypes.NullType, true);
            break;

        default:
            structField = DataTypes.createStructField(colName, DataTypes.StringType, true);
        }

        structFields[i - 1] = structField;
    }

    StructType schema = new StructType(structFields);

    List<String> fileList = new ArrayList<>();
    try (BufferedReader br = Files.newBufferedReader(Paths.get(fileListName))) {

        // br.lines() returns a Stream; collect it into a List
        fileList = br.lines().collect(Collectors.toList());

    } catch (IOException e) {
        e.printStackTrace();
    }

    for (String file : fileList) {
        options.put("path", "file:///" + file);
        dynamicDataFrame = hiveContext.load("com.databricks.spark.csv", schema, options);
        if (combinedDynamicDataFrame == null) {
            combinedDynamicDataFrame = dynamicDataFrame;
        } else {
            combinedDynamicDataFrame = combinedDynamicDataFrame.unionAll(dynamicDataFrame);
        }
    }

    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("loaded table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    schema = combinedDynamicDataFrame.schema();
    structFields = schema.fields();
    for (StructField structFieldLocal : structFields) {
        DataType dataType = structFieldLocal.dataType();
        logger.debug(structFieldLocal.name() + " - dataType: " + dataType.typeName());
    }

    startTime = System.currentTimeMillis();
    combinedDynamicDataFrame.cache();
    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("cache table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: " + durMinutes);

    startTime = System.currentTimeMillis();
    combinedDynamicDataFrame.registerTempTable(registerTableName);

    durSeconds = (float) (System.currentTimeMillis() - startTime) / 1000F;
    durMinutes = durSeconds / 60F;
    log("registerTempTable table " + registerTableName + " in seconds: " + durSeconds + " / in minutes: "
            + durMinutes);

    durSeconds = (float) (System.currentTimeMillis() - firstStartTime) / 1000F;
    durMinutes = durSeconds / 60F;
    respMsg = "Completed loading table " + fullPathTableName + " in seconds: " + durSeconds + " / in minutes: "
            + durMinutes;
    log(respMsg);
    return respMsg;
}

From source file:com.hxr.bigdata.spark.example141.JavaSparkSQL.java

License:Apache License

public static void main(final String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(ctx);

    System.out.println("=== Data source: RDD ===");
    // Load a text file and convert each line to a Java Bean.
    //         hdfs://127.0.0.1:9000/spark/people.txt
    JavaRDD<Person> people = ctx.textFile("/spark/people.txt").map(new Function<String, Person>() {

        public Person call(final String line) {
            String[] parts = line.split(",");

            Person person = new Person();
            person.setName(parts[0]);
            person.setAge(Integer.parseInt(parts[1].trim()));

            return person;
        }
    });

    // Apply a schema to an RDD of Java Beans and register it as a table.
    DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
    schemaPeople.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }
    //        ------------------------ Programmatic schema (without a JavaBean) --------------------------
    // The schema is encoded in a string
    String schemaString = "name age";

    // Generate the schema based on the string of schema
    List<StructField> fields = new ArrayList<StructField>();
    for (String fieldName : schemaString.split(" ")) {
        fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
    }
    StructType schema = DataTypes.createStructType(fields);

    // Load the text file; records are converted to Rows below.
    JavaRDD<String> peopleT = ctx.textFile("/spark/people.txt");

    // Convert records of the RDD (people) to Rows.
    JavaRDD<Row> rowRDD = peopleT.map(new Function<String, Row>() {
        public Row call(final String record) throws Exception {
            String[] fields = record.split(",");
            return RowFactory.create(fields[0], fields[1].trim());
        }
    });

    // Apply the schema to the RDD.
    DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);

    // Register the DataFrame as a table.
    peopleDataFrame.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame results = sqlContext.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> names = results.javaRDD().map(new Function<Row, String>() {
        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();

    System.out.println("=== Data source: Parquet File ===");
    // DataFrames can be saved as parquet files, maintaining the schema information.
    //        e.g. hdfs://127.0.0.1:9000/user/hanxirui/people.parquet
    //        SaveMode.ErrorIfExists (default)  When saving a DataFrame to a data source, if data already exists, an exception is expected to be thrown.
    //        SaveMode.Append When saving a DataFrame to a data source, if data/table already exists, contents of the DataFrame are expected to be appended to existing data.
    //        SaveMode.Overwrite  Overwrite mode means that when saving a DataFrame to a data source, if data/table already exists, existing data is expected to be overwritten by the contents of the DataFrame.
    //        SaveMode.Ignore  Ignore mode means that when saving a DataFrame to a data source, if data already exists, the save operation is expected to not save the contents of the DataFrame and to not change the existing data. This is similar to a CREATE TABLE IF NOT EXISTS in SQL.
    schemaPeople.write().mode(SaveMode.Ignore).parquet("people.parquet");

    // Read in the parquet file created above.
    // Parquet files are self-describing so the schema is preserved.
    // The result of loading a parquet file is also a DataFrame.
    DataFrame parquetFile = sqlContext.read().parquet("people.parquet");

    // Parquet files can also be registered as tables and then used in SQL statements.
    parquetFile.registerTempTable("parquetFile");
    DataFrame teenagers2 = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
    teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    System.out.println("=== Data source: JSON Dataset ===");
    // A JSON dataset is pointed to by a path.
    // The path can be either a single text file or a directory storing text files.
    String path = "/spark/people.json";
    // Create a DataFrame from the file(s) pointed by path
    DataFrame peopleFromJsonFile = sqlContext.read().json(path);

    // Because the schema of a JSON dataset is automatically inferred, it is better
    // to take a look at the schema before writing queries.
    peopleFromJsonFile.printSchema();
    // The schema of people is ...
    // root
    // |-- age: IntegerType
    // |-- name: StringType

    // Register this DataFrame as a table.
    peopleFromJsonFile.registerTempTable("people");

    // SQL statements can be run by using the sql methods provided by sqlContext.
    DataFrame teenagers3 = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are DataFrame and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0);
        }
    }).collect();
    for (String name : teenagerNames) {
        System.out.println(name);
    }

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays
            .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
    DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());

    // Take a look at the schema of this new DataFrame.
    peopleFromJsonRDD.printSchema();
    // The schema of anotherPeople is ...
    // root
    // |-- address: StructType
    // | |-- city: StringType
    // | |-- state: StringType
    // |-- name: StringType

    peopleFromJsonRDD.registerTempTable("people2");

    DataFrame peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
    List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {

        public String call(final Row row) {
            return "Name: " + row.getString(0) + ", City: " + row.getString(1);
        }
    }).collect();
    for (String name : nameAndCity) {
        System.out.println(name);
    }

    ctx.stop();
}

From source file:com.ibm.bi.dml.api.MLOutput.java

License:Open Source License

/**
 * This method improves the performance of MLPipeline wrappers.
 * @param sqlContext
 * @param varName
 * @param range range is inclusive
 * @return
 * @throws DMLRuntimeException
 */
public DataFrame getDF(SQLContext sqlContext, String varName, HashMap<String, Tuple2<Long, Long>> range)
        throws DMLRuntimeException {
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD = getBinaryBlockedRDD(varName);
    if (binaryBlockRDD == null) {
        throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table.");
    }
    MatrixCharacteristics mc = _outMetadata.get(varName);
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    ArrayList<Tuple2<String, Tuple2<Long, Long>>> alRange = new ArrayList<Tuple2<String, Tuple2<Long, Long>>>();
    for (Entry<String, Tuple2<Long, Long>> e : range.entrySet()) {
        alRange.add(new Tuple2<String, Tuple2<Long, Long>>(e.getKey(), e.getValue()));
    }

    // Very expensive operation here: groupByKey (where number of keys might be too large)
    JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen)).groupByKey()
            .map(new ConvertDoubleArrayToRangeRows(clen, bclen, alRange));

    int numColumns = (int) clen;
    if (numColumns <= 0) {
        throw new DMLRuntimeException(
                "Output dimensions unknown after executing the script and hence cannot create the dataframe");
    }

    List<StructField> fields = new ArrayList<StructField>();
    // LongTypes throw an error: java.lang.Double incompatible with java.lang.Long
    fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false));
    for (int k = 0; k < alRange.size(); k++) {
        String colName = alRange.get(k)._1;
        long low = alRange.get(k)._2._1;
        long high = alRange.get(k)._2._2;
        if (low != high)
            fields.add(DataTypes.createStructField(colName, new VectorUDT(), false));
        else
            fields.add(DataTypes.createStructField(colName, DataTypes.DoubleType, false));
    }

    // This will cause infinite recursion due to bug in Spark
    // https://issues.apache.org/jira/browse/SPARK-6999
    // return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames
    return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));

}