List of usage examples for org.apache.spark.sql.types.DataTypes.DoubleType, the Spark SQL DataType representing double-precision (64-bit) floating-point values.
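Each example below passes DataTypes.DoubleType to DataTypes.createStructField or to a StructField constructor when declaring a double-valued column in a schema. As a minimal sketch of that common pattern (the column names, app name, and class name here are illustrative, not taken from any of the source files below):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class DoubleTypeSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("DoubleTypeSketch").getOrCreate();

        // Declare a non-nullable integer column and a nullable double column.
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("id", DataTypes.IntegerType, false),
                DataTypes.createStructField("value", DataTypes.DoubleType, true)));

        List<Row> rows = Arrays.asList(RowFactory.create(1, 3.14), RowFactory.create(2, 2.72));
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema();
        df.show();

        spark.stop();
    }
}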
From source file:KafkaSparkMongo.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n");
        System.exit(1);
    }

    String brokers = args[0];
    String topics = args[1];
    String UriMongo = "mongodb://localhost/streamSparkFinal.coll";
    dropDatabase(UriMongo);

    // Create the streaming context with a 5 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount")
            .set("spark.app.id", "MongoSparkConnectorTour")
            .set("spark.mongodb.input.uri", UriMongo)
            .set("spark.mongodb.output.uri", UriMongo);
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));

    /**
     * Create a direct stream and count the words in the input stream of
     * \n delimited text (e.g. generated by 'nc').
     * Note that skipping replication in the storage level is only acceptable when running
     * locally; replication is necessary in a distributed scenario for fault tolerance.
     */
    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    messages.print();

    JavaDStream<String> lines = messages.map(x -> x._2());

    JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> {
        String[] wordy = SPACE.split(y);
        return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]);
    });

    words.foreachRDD(rdd -> {
        List<StructField> subFields = new ArrayList<>();
        subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true));
        subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true));

        List<StructField> fields = new ArrayList<>();
        fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true));
        fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true));
        StructType schema = DataTypes.createStructType(fields);

        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

        JavaRDD<Row> rowRDD = rdd.map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(),
                RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()),
                        Double.parseDouble(palabra._6())),
                Timestamp.from(Instant.parse(palabra._7()))));

        Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema);

        wordsDataFrame.show();

        MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save();
    });

    ssc.start();
    ssc.awaitTermination();
}
From source file:com.andado.spark.examples.ml.JavaAFTSurvivalRegressionExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaAFTSurvivalRegressionExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)),
            RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)),
            RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)),
            RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)),
            RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226)));
    StructType schema = new StructType(
            new StructField[] { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> training = spark.createDataFrame(data, schema);
    double[] quantileProbabilities = new double[] { 0.3, 0.6 };
    AFTSurvivalRegression aft = new AFTSurvivalRegression().setQuantileProbabilities(quantileProbabilities)
            .setQuantilesCol("quantiles");

    AFTSurvivalRegressionModel model = aft.fit(training);

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    System.out.println("Coefficients: " + model.coefficients());
    System.out.println("Intercept: " + model.intercept());
    System.out.println("Scale: " + model.scale());
    model.transform(training).show(false);
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaBinarizerExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 0.1), RowFactory.create(1, 0.8),
            RowFactory.create(2, 0.2));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

    Binarizer binarizer = new Binarizer().setInputCol("feature").setOutputCol("binarized_feature")
            .setThreshold(0.5);

    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
    binarizedDataFrame.show();
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaBucketizerExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();

    // $example on$
    double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY };

    List<Row> data = Arrays.asList(RowFactory.create(-999.9), RowFactory.create(-0.5),
            RowFactory.create(-0.3), RowFactory.create(0.0), RowFactory.create(0.2),
            RowFactory.create(999.9));
    StructType schema = new StructType(
            new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures")
            .setSplits(splits);

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

    System.out.println("Bucketizer output with " + (bucketizer.getSplits().length - 1) + " buckets");
    bucketedData.show();
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaChiSqSelectorExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
            RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
            RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()),
                    new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    ChiSqSelector selector = new ChiSqSelector().setNumTopFeatures(1).setFeaturesCol("features")
            .setLabelCol("clicked").setOutputCol("selectedFeatures");

    Dataset<Row> result = selector.fit(df).transform(df);

    System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures() + " features selected");
    result.show();
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaEstimatorTransformerParamExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();

    // $example on$
    // Prepare training data.
    List<Row> dataTraining = Arrays.asList(RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
            RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
            RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
            RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5)));
    StructType schema = new StructType(
            new StructField[] { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

    // Create a LogisticRegression instance. This instance is an Estimator.
    LogisticRegression lr = new LogisticRegression();
    // Print out the parameters, documentation, and any default values.
    System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

    // We may set parameters using setter methods.
    lr.setMaxIter(10).setRegParam(0.01);

    // Learn a LogisticRegression model. This uses the parameters stored in lr.
    LogisticRegressionModel model1 = lr.fit(training);
    // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
    // we can view the parameters it used during fit().
    // This prints the parameter (name: value) pairs, where names are unique IDs for this
    // LogisticRegression instance.
    System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

    // We may alternatively specify parameters using a ParamMap.
    ParamMap paramMap = new ParamMap().put(lr.maxIter().w(20)) // Specify 1 Param.
            .put(lr.maxIter(), 30) // This overwrites the original maxIter.
            .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params.

    // One can also combine ParamMaps.
    ParamMap paramMap2 = new ParamMap().put(lr.probabilityCol().w("myProbability")); // Change output column name
    ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

    // Now learn a new model using the paramMapCombined parameters.
    // paramMapCombined overrides all parameters set earlier via lr.set* methods.
    LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
    System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

    // Prepare test documents.
    List<Row> dataTest = Arrays.asList(RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
            RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
            RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5)));
    Dataset<Row> test = spark.createDataFrame(dataTest, schema);

    // Make predictions on test documents using the Transformer.transform() method.
    // LogisticRegression.transform will only use the 'features' column.
    // Note that model2.transform() outputs a 'myProbability' column instead of the usual
    // 'probability' column since we renamed the lr.probabilityCol parameter previously.
    Dataset<Row> results = model2.transform(test);
    Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
    for (Row r : rows.collectAsList()) {
        System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
                + ", prediction=" + r.get(3));
    }
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaQuantileDiscretizerExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaQuantileDiscretizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 18.0), RowFactory.create(1, 19.0),
            RowFactory.create(2, 8.0), RowFactory.create(3, 5.0), RowFactory.create(4, 2.2));

    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    // $example off$
    // Output of QuantileDiscretizer for such small datasets can depend on the number of
    // partitions. Here we force a single partition to ensure consistent results.
    // Note this is not necessary for normal use cases
    df = df.repartition(1);
    // $example on$
    QuantileDiscretizer discretizer = new QuantileDiscretizer().setInputCol("hour").setOutputCol("result")
            .setNumBuckets(3);

    Dataset<Row> result = discretizer.fit(df).transform(df);
    result.show();
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaSQLTransformerExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaSQLTransformerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 1.0, 3.0), RowFactory.create(2, 2.0, 5.0));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("v2", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    SQLTransformer sqlTrans = new SQLTransformer()
            .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

    sqlTrans.transform(df).show();
    // $example off$

    spark.stop();
}
From source file:com.andado.spark.examples.ml.JavaTfIdfExample.java
License:Apache License
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0.0, "Hi I heard about Spark"),
            RowFactory.create(0.0, "I wish Java could use case classes"),
            RowFactory.create(1.0, "Logistic regression models are neat"));
    StructType schema = new StructType(
            new StructField[] { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                    new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) });
    Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    Dataset<Row> wordsData = tokenizer.transform(sentenceData);

    int numFeatures = 20;
    HashingTF hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
            .setNumFeatures(numFeatures);

    Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    IDFModel idfModel = idf.fit(featurizedData);

    Dataset<Row> rescaledData = idfModel.transform(featurizedData);
    rescaledData.select("label", "features").show();
    // $example off$

    spark.stop();
}
From source file:com.bosscs.spark.commons.utils.CellsUtils.java
License:Apache License
private static DataType getDataType(Object value) {
    Class cls = value.getClass();
    DataType dataType;
    if (cls.equals(String.class)) {
        dataType = DataTypes.StringType;
    } else if (cls.equals(Byte[].class)) {
        dataType = DataTypes.BinaryType;
    } else if (cls.equals(Boolean.class)) {
        dataType = DataTypes.BooleanType;
    } else if (cls.equals(Timestamp.class)) {
        dataType = DataTypes.TimestampType;
    } else if (cls.equals(Double.class)) {
        dataType = DataTypes.DoubleType;
    } else if (cls.equals(Float.class)) {
        dataType = DataTypes.FloatType;
    } else if (cls.equals(Byte.class)) {
        dataType = DataTypes.ByteType;
    } else if (cls.equals(Integer.class)) {
        dataType = DataTypes.IntegerType;
    } else if (cls.equals(Long.class)) {
        dataType = DataTypes.LongType;
    } else if (cls.equals(Short.class)) {
        dataType = DataTypes.ShortType;
    } else if (value instanceof List) {
        List listValue = (List) value;
        if (listValue.isEmpty()) {
            dataType = DataTypes.createArrayType(DataTypes.StringType);
        } else {
            dataType = DataTypes.createArrayType(getDataType(listValue.get(0)));
        }
    } else if (value instanceof Map) {
        Map mapValue = (Map) value;
        if (mapValue.isEmpty()) {
            dataType = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType);
        } else {
            Map.Entry entry = (Map.Entry) mapValue.entrySet().iterator().next();
            dataType = DataTypes.createMapType(getDataType(entry.getKey()), getDataType(entry.getValue()));
        }
    } else {
        dataType = DataTypes.StringType;
    }
    return dataType;
}
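This last example maps Java runtime classes to Spark SQL types, returning DataTypes.DoubleType for java.lang.Double values. The real getDataType above is a private helper of CellsUtils; as a hedged illustration of how such a mapper could drive schema inference from a sample record, the standalone sketch below uses hypothetical names (SchemaSketch, dataTypeOf) and a deliberately reduced type mapping.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaSketch {
    // Simplified stand-in for a Class -> DataType mapper (hypothetical, not the CellsUtils method).
    static DataType dataTypeOf(Object value) {
        if (value instanceof Double) return DataTypes.DoubleType;
        if (value instanceof Integer) return DataTypes.IntegerType;
        return DataTypes.StringType;
    }

    public static void main(String[] args) {
        // A sample record whose field types drive the inferred schema.
        Map<String, Object> sample = new LinkedHashMap<>();
        sample.put("id", 1);
        sample.put("score", 0.75); // Double value maps to DataTypes.DoubleType
        sample.put("name", "row-1");

        List<StructField> fields = new ArrayList<>();
        for (Map.Entry<String, Object> e : sample.entrySet()) {
            fields.add(DataTypes.createStructField(e.getKey(), dataTypeOf(e.getValue()), true));
        }
        StructType schema = DataTypes.createStructType(fields);
        System.out.println(schema.treeString());
    }
}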