List of usage examples for org.apache.spark.sql.types DataTypes IntegerType
DataType IntegerType
To view the source code for org.apache.spark.sql.types.DataTypes.IntegerType, click the Source Link below.
From source file:com.andado.spark.examples.ml.JavaBinarizerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, 0.1), RowFactory.create(1, 0.8), RowFactory.create(2, 0.2));/*from ww w . j ava2s. c o m*/ StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema); Binarizer binarizer = new Binarizer().setInputCol("feature").setOutputCol("binarized_feature") .setThreshold(0.5); Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame); System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold()); binarizedDataFrame.show(); // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaBucketedRandomProjectionLSHExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaBucketedRandomProjectionLSHExample").getOrCreate(); // $example on$ List<Row> dataA = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 1.0)), RowFactory.create(1, Vectors.dense(1.0, -1.0)), RowFactory.create(2, Vectors.dense(-1.0, -1.0)), RowFactory.create(3, Vectors.dense(-1.0, 1.0))); List<Row> dataB = Arrays.asList(RowFactory.create(4, Vectors.dense(1.0, 0.0)), RowFactory.create(5, Vectors.dense(-1.0, 0.0)), RowFactory.create(6, Vectors.dense(0.0, 1.0)), RowFactory.create(7, Vectors.dense(0.0, -1.0))); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("keys", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dfA = spark.createDataFrame(dataA, schema); Dataset<Row> dfB = spark.createDataFrame(dataB, schema); Vector key = Vectors.dense(1.0, 0.0); BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH().setBucketLength(2.0).setNumHashTables(3) .setInputCol("keys").setOutputCol("values"); BucketedRandomProjectionLSHModel model = mh.fit(dfA); // Feature Transformation model.transform(dfA).show();/* w ww.java2s .com*/ // Cache the transformed columns Dataset<Row> transformedA = model.transform(dfA).cache(); Dataset<Row> transformedB = model.transform(dfB).cache(); // Approximate similarity join model.approxSimilarityJoin(dfA, dfB, 1.5).show(); model.approxSimilarityJoin(transformedA, transformedB, 1.5).show(); // Self Join model.approxSimilarityJoin(dfA, dfA, 2.5).filter("datasetA.id < datasetB.id").show(); // Approximate nearest neighbor search model.approxNearestNeighbors(dfA, key, 2).show(); model.approxNearestNeighbors(transformedA, key, 2).show(); // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaChiSqSelectorExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0), RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0), RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()), new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(data, schema); ChiSqSelector selector = new ChiSqSelector().setNumTopFeatures(1).setFeaturesCol("features") .setLabelCol("clicked").setOutputCol("selectedFeatures"); Dataset<Row> result = selector.fit(df).transform(df); System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures() + " features selected"); result.show();/*from www . j av a 2 s.c o m*/ // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaIndexToStringExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, "a"), RowFactory.create(1, "b"), RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"), RowFactory.create(5, "c")); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("category", DataTypes.StringType, false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(data, schema); StringIndexerModel indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex") .fit(df);/*from w w w . j ava2 s .co m*/ Dataset<Row> indexed = indexer.transform(df); System.out.println("Transformed string column '" + indexer.getInputCol() + "' " + "to indexed column '" + indexer.getOutputCol() + "'"); indexed.show(); StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol()); System.out.println("StringIndexer will store labels in output column metadata: " + Attribute.fromStructField(inputColSchema).toString() + "\n"); IndexToString converter = new IndexToString().setInputCol("categoryIndex").setOutputCol("originalCategory"); Dataset<Row> converted = converter.transform(indexed); System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " + "original string column '" + converter.getOutputCol() + "' using labels in metadata"); converted.select("id", "categoryIndex", "originalCategory").show(); // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaInteractionExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaInteractionExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(1, 1, 2, 3, 8, 4, 5), RowFactory.create(2, 4, 3, 8, 7, 9, 8), RowFactory.create(3, 6, 1, 9, 2, 3, 6), RowFactory.create(4, 10, 8, 6, 9, 4, 5), RowFactory.create(5, 9, 2, 7, 10, 7, 3), RowFactory.create(6, 1, 1, 4, 2, 8, 4)); StructType schema = new StructType( new StructField[] { new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id7", DataTypes.IntegerType, false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(data, schema); VectorAssembler assembler1 = new VectorAssembler().setInputCols(new String[] { "id2", "id3", "id4" }) .setOutputCol("vec1"); Dataset<Row> assembled1 = assembler1.transform(df); VectorAssembler assembler2 = new VectorAssembler().setInputCols(new String[] { "id5", "id6", "id7" }) .setOutputCol("vec2"); Dataset<Row> assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2"); Interaction interaction = new Interaction().setInputCols(new String[] { "id1", "vec1", "vec2" }) .setOutputCol("interactedCol"); Dataset<Row> interacted = interaction.transform(assembled2); interacted.show(false);//ww w.jav a2s . c o m // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaMaxAbsScalerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaMaxAbsScalerExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)), RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)), RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); MaxAbsScaler scaler = new MaxAbsScaler().setInputCol("features").setOutputCol("scaledFeatures"); // Compute summary statistics and generate MaxAbsScalerModel MaxAbsScalerModel scalerModel = scaler.fit(dataFrame); // rescale each feature to range [-1, 1]. Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.select("features", "scaledFeatures").show(); // $example off$ spark.stop();// ww w . ja v a 2 s. c o m }
From source file:com.andado.spark.examples.ml.JavaMinHashLSHExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaMinHashLSHExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0, Vectors.sparse(6, new int[] { 0, 1, 2 }, new double[] { 1.0, 1.0, 1.0 })), RowFactory.create(1, Vectors.sparse(6, new int[] { 2, 3, 4 }, new double[] { 1.0, 1.0, 1.0 })), RowFactory.create(2, Vectors.sparse(6, new int[] { 0, 2, 4 }, new double[] { 1.0, 1.0, 1.0 }))); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("keys", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); MinHashLSH mh = new MinHashLSH().setNumHashTables(1).setInputCol("keys").setOutputCol("values"); MinHashLSHModel model = mh.fit(dataFrame); model.transform(dataFrame).show();// ww w . j a va 2 s . c om // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaMinMaxScalerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaMinMaxScalerExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)), RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)), RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); MinMaxScaler scaler = new MinMaxScaler().setInputCol("features").setOutputCol("scaledFeatures"); // Compute summary statistics and generate MinMaxScalerModel MinMaxScalerModel scalerModel = scaler.fit(dataFrame); // rescale each feature to range [min, max]. Dataset<Row> scaledData = scalerModel.transform(dataFrame); System.out.println("Features scaled to range: [" + scaler.getMin() + ", " + scaler.getMax() + "]"); scaledData.select("features", "scaledFeatures").show(); // $example off$ spark.stop();/*from www. j a va 2s. c om*/ }
From source file:com.andado.spark.examples.ml.JavaNGramExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")), RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")), RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat"))); StructType schema = new StructType(new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema); NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams"); Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame); ngramDataFrame.select("ngrams").show(false); // $example off$ spark.stop();//from w ww .j a va2 s. c o m }
From source file:com.andado.spark.examples.ml.JavaNormalizerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaNormalizerExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)), RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)), RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))); StructType schema = new StructType( new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); // Normalize each Vector using $L^1$ norm. Normalizer normalizer = new Normalizer().setInputCol("features").setOutputCol("normFeatures").setP(1.0); Dataset<Row> l1NormData = normalizer.transform(dataFrame); l1NormData.show();//from www . j a va2 s. c o m // Normalize each Vector using $L^\infty$ norm. Dataset<Row> lInfNormData = normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY)); lInfNormData.show(); // $example off$ spark.stop(); }