Example usage for org.apache.spark.sql.types DataTypes DoubleType

List of usage examples for org.apache.spark.sql.types DataTypes DoubleType

Introduction

On this page you can find example usage of org.apache.spark.sql.types.DataTypes.DoubleType.

Prototype

public static final DataType DoubleType


Document

Gets the DoubleType object.
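
As a quick orientation before the extracted examples, here is a minimal, self-contained sketch (not taken from any of the source files below; the class name, column names, and values are illustrative) showing DataTypes.DoubleType used to declare a double column in a programmatically built schema:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class DoubleTypeSketch {
    public static void main(String[] args) {
        // master("local[*]") is only for running the sketch locally.
        SparkSession spark = SparkSession.builder().appName("DoubleTypeSketch").master("local[*]")
                .getOrCreate();

        // "price" is declared as a non-nullable double column via DataTypes.DoubleType.
        StructType schema = DataTypes.createStructType(
                Arrays.asList(DataTypes.createStructField("item", DataTypes.StringType, false),
                        DataTypes.createStructField("price", DataTypes.DoubleType, false)));

        List<Row> rows = Arrays.asList(RowFactory.create("a", 1.5), RowFactory.create("b", 2.25));
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema(); // price is reported as: double (nullable = false)
        df.show();

        spark.stop();
    }
}

Running the sketch prints a schema in which price is typed as double, confirming the mapping.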

Usage

From source file: KafkaSparkMongo.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n");
        System.exit(1);
    }

    String brokers = args[0];
    String topics = args[1];

    String UriMongo = "mongodb://localhost/streamSparkFinal.coll";
    dropDatabase(UriMongo);

    // Create the streaming context with a 5 second batch interval
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount")
            .set("spark.app.id", "MongoSparkConnectorTour").set("spark.mongodb.input.uri", UriMongo)
            .set("spark.mongodb.output.uri", UriMongo);

    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    messages.print();

    JavaDStream<String> lines = messages.map(x -> x._2());

    JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> {
        String[] wordy = SPACE.split(y);
        return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]);
    });

    words.foreachRDD(rdd -> {

        List<StructField> subFields = new ArrayList<>();
        subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true));

        List<StructField> fields = new ArrayList<>();
        fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true));
        fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true));

        StructType schema = DataTypes.createStructType(fields);

        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

        JavaRDD<Row> rowRDD = rdd
                .map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(),
                        RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()),
                                Double.parseDouble(palabra._6())),
                        Timestamp.from(Instant.parse(palabra._7()))));

        Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema);
        wordsDataFrame.show();
        MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save();
    });

    ssc.start();
    ssc.awaitTermination();
}

From source file: com.andado.spark.examples.ml.JavaAFTSurvivalRegressionExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaAFTSurvivalRegressionExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)),
            RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)),
            RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)),
            RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)),
            RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226)));
    StructType schema = new StructType(
            new StructField[] { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> training = spark.createDataFrame(data, schema);
    double[] quantileProbabilities = new double[] { 0.3, 0.6 };
    AFTSurvivalRegression aft = new AFTSurvivalRegression().setQuantileProbabilities(quantileProbabilities)
            .setQuantilesCol("quantiles");

    AFTSurvivalRegressionModel model = aft.fit(training);

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    System.out.println("Coefficients: " + model.coefficients());
    System.out.println("Intercept: " + model.intercept());
    System.out.println("Scale: " + model.scale());
    model.transform(training).show(false);
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaBinarizerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 0.1), RowFactory.create(1, 0.8),
            RowFactory.create(2, 0.2));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

    Binarizer binarizer = new Binarizer().setInputCol("feature").setOutputCol("binarized_feature")
            .setThreshold(0.5);

    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
    binarizedDataFrame.show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaBucketizerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();

    // $example on$
    double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY };

    List<Row> data = Arrays.asList(RowFactory.create(-999.9), RowFactory.create(-0.5), RowFactory.create(-0.3),
            RowFactory.create(0.0), RowFactory.create(0.2), RowFactory.create(999.9));
    StructType schema = new StructType(
            new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures")
            .setSplits(splits);

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

    System.out.println("Bucketizer output with " + (bucketizer.getSplits().length - 1) + " buckets");
    bucketedData.show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaChiSqSelectorExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
            RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
            RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()),
                    new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    ChiSqSelector selector = new ChiSqSelector().setNumTopFeatures(1).setFeaturesCol("features")
            .setLabelCol("clicked").setOutputCol("selectedFeatures");

    Dataset<Row> result = selector.fit(df).transform(df);

    System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures() + " features selected");
    result.show();

    // $example off$
    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaEstimatorTransformerParamExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();

    // $example on$
    // Prepare training data.
    List<Row> dataTraining = Arrays.asList(RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
            RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
            RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
            RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5)));
    StructType schema = new StructType(
            new StructField[] { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

    // Create a LogisticRegression instance. This instance is an Estimator.
    LogisticRegression lr = new LogisticRegression();
    // Print out the parameters, documentation, and any default values.
    System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

    // We may set parameters using setter methods.
    lr.setMaxIter(10).setRegParam(0.01);

    // Learn a LogisticRegression model. This uses the parameters stored in lr.
    LogisticRegressionModel model1 = lr.fit(training);
    // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
    // we can view the parameters it used during fit().
    // This prints the parameter (name: value) pairs, where names are unique IDs for this
    // LogisticRegression instance.
    System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

    // We may alternatively specify parameters using a ParamMap.
    ParamMap paramMap = new ParamMap().put(lr.maxIter().w(20)) // Specify 1 Param.
            .put(lr.maxIter(), 30) // This overwrites the original maxIter.
            .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params.

    // One can also combine ParamMaps.
    ParamMap paramMap2 = new ParamMap().put(lr.probabilityCol().w("myProbability")); // Change output column name
    ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

    // Now learn a new model using the paramMapCombined parameters.
    // paramMapCombined overrides all parameters set earlier via lr.set* methods.
    LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
    System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

    // Prepare test documents.
    List<Row> dataTest = Arrays.asList(RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
            RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
            RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5)));
    Dataset<Row> test = spark.createDataFrame(dataTest, schema);

    // Make predictions on test documents using the Transformer.transform() method.
    // LogisticRegression.transform will only use the 'features' column.
    // Note that model2.transform() outputs a 'myProbability' column instead of the usual
    // 'probability' column since we renamed the lr.probabilityCol parameter previously.
    Dataset<Row> results = model2.transform(test);
    Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
    for (Row r : rows.collectAsList()) {
        System.out.println(
                "(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3));
    }
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaQuantileDiscretizerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaQuantileDiscretizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 18.0), RowFactory.create(1, 19.0),
            RowFactory.create(2, 8.0), RowFactory.create(3, 5.0), RowFactory.create(4, 2.2));

    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    // $example off$
    // Output of QuantileDiscretizer for such small datasets can depend on the number of
    // partitions. Here we force a single partition to ensure consistent results.
    // Note: this is not necessary for normal use cases.
    df = df.repartition(1);
    // $example on$
    QuantileDiscretizer discretizer = new QuantileDiscretizer().setInputCol("hour").setOutputCol("result")
            .setNumBuckets(3);

    Dataset<Row> result = discretizer.fit(df).transform(df);
    result.show();
    // $example off$
    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaSQLTransformerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaSQLTransformerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 1.0, 3.0), RowFactory.create(2, 2.0, 5.0));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("v2", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    SQLTransformer sqlTrans = new SQLTransformer()
            .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

    sqlTrans.transform(df).show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaTfIdfExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0.0, "Hi I heard about Spark"),
            RowFactory.create(0.0, "I wish Java could use case classes"),
            RowFactory.create(1.0, "Logistic regression models are neat"));
    StructType schema = new StructType(
            new StructField[] { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) });
    Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    Dataset<Row> wordsData = tokenizer.transform(sentenceData);

    int numFeatures = 20;
    HashingTF hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
            .setNumFeatures(numFeatures);

    Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    IDFModel idfModel = idf.fit(featurizedData);

    Dataset<Row> rescaledData = idfModel.transform(featurizedData);
    rescaledData.select("label", "features").show();
    // $example off$

    spark.stop();
}

From source file: com.bosscs.spark.commons.utils.CellsUtils.java

License: Apache License

/**
 * Infers the Spark SQL DataType that corresponds to a Java value's runtime
 * class, recursing into List and Map elements and defaulting to StringType.
 */
private static DataType getDataType(Object value) {
    Class cls = value.getClass();
    DataType dataType;
    if (cls.equals(String.class)) {
        dataType = DataTypes.StringType;
    } else if (cls.equals(Byte[].class)) {
        dataType = DataTypes.BinaryType;
    } else if (cls.equals(Boolean.class)) {
        dataType = DataTypes.BooleanType;
    } else if (cls.equals(Timestamp.class)) {
        dataType = DataTypes.TimestampType;
    } else if (cls.equals(Double.class)) {
        dataType = DataTypes.DoubleType;
    } else if (cls.equals(Float.class)) {
        dataType = DataTypes.FloatType;
    } else if (cls.equals(Byte.class)) {
        dataType = DataTypes.ByteType;
    } else if (cls.equals(Integer.class)) {
        dataType = DataTypes.IntegerType;
    } else if (cls.equals(Long.class)) {
        dataType = DataTypes.LongType;
    } else if (cls.equals(Short.class)) {
        dataType = DataTypes.ShortType;
    } else if (value instanceof List) {
        List listValue = (List) value;
        if (listValue.isEmpty()) {
            dataType = DataTypes.createArrayType(DataTypes.StringType);
        } else {
            dataType = DataTypes.createArrayType(getDataType(listValue.get(0)));
        }
    } else if (value instanceof Map) {
        Map mapValue = (Map) value;
        if (mapValue.isEmpty()) {
            dataType = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType);
        } else {
            Map.Entry entry = (Map.Entry) mapValue.entrySet().iterator().next();
            dataType = DataTypes.createMapType(getDataType(entry.getKey()), getDataType(entry.getValue()));
        }
    } else {
        dataType = DataTypes.StringType;
    }
    return dataType;
}
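
For reference, here are a few of the mappings this helper yields, written as hypothetical calls since the method is private to CellsUtils:

// getDataType("abc")                              -> DataTypes.StringType
// getDataType(3.14)                               -> DataTypes.DoubleType (3.14 autoboxes to Double)
// getDataType(Arrays.asList(1.0, 2.0))            -> ArrayType(DoubleType)
// getDataType(Collections.singletonMap("k", 1.0)) -> MapType(StringType, DoubleType)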