Example usage for org.apache.spark.sql.types DataTypes IntegerType

Introduction

This page collects example usages of DataTypes.IntegerType from the org.apache.spark.sql.types package.

Prototype

public static final DataType IntegerType

Document

Gets the IntegerType object.
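
As a quick orientation, here is a minimal, self-contained sketch of the pattern the examples below share: DataTypes.IntegerType supplies the 32-bit integer DataType for a schema field. The class name, column name, and sample rows are illustrative only, not taken from any of the examples.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class IntegerTypeSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("IntegerTypeSketch").getOrCreate();

        // DataTypes.IntegerType is the shared singleton instance of the 32-bit integer type.
        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()) });

        List<Row> rows = Arrays.asList(RowFactory.create(1), RowFactory.create(2), RowFactory.create(3));
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema(); // id: integer (nullable = false)

        spark.stop();
    }
}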

Usage

From source file: com.andado.spark.examples.ml.JavaBinarizerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, 0.1), RowFactory.create(1, 0.8),
            RowFactory.create(2, 0.2));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

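    // Binarizer maps feature values greater than the threshold to 1.0 and all others to 0.0.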
    Binarizer binarizer = new Binarizer().setInputCol("feature").setOutputCol("binarized_feature")
            .setThreshold(0.5);

    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
    binarizedDataFrame.show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaBucketedRandomProjectionLSHExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBucketedRandomProjectionLSHExample").getOrCreate();

    // $example on$
    List<Row> dataA = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 1.0)),
            RowFactory.create(1, Vectors.dense(1.0, -1.0)), RowFactory.create(2, Vectors.dense(-1.0, -1.0)),
            RowFactory.create(3, Vectors.dense(-1.0, 1.0)));

    List<Row> dataB = Arrays.asList(RowFactory.create(4, Vectors.dense(1.0, 0.0)),
            RowFactory.create(5, Vectors.dense(-1.0, 0.0)), RowFactory.create(6, Vectors.dense(0.0, 1.0)),
            RowFactory.create(7, Vectors.dense(0.0, -1.0)));

    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("keys", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> dfA = spark.createDataFrame(dataA, schema);
    Dataset<Row> dfB = spark.createDataFrame(dataB, schema);

    Vector key = Vectors.dense(1.0, 0.0);

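    // BucketedRandomProjectionLSH is an LSH family for approximate nearest-neighbor search in Euclidean distance.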
    BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH().setBucketLength(2.0).setNumHashTables(3)
            .setInputCol("keys").setOutputCol("values");

    BucketedRandomProjectionLSHModel model = mh.fit(dfA);

    // Feature Transformation
    model.transform(dfA).show();
    // Cache the transformed columns
    Dataset<Row> transformedA = model.transform(dfA).cache();
    Dataset<Row> transformedB = model.transform(dfB).cache();

    // Approximate similarity join
    model.approxSimilarityJoin(dfA, dfB, 1.5).show();
    model.approxSimilarityJoin(transformedA, transformedB, 1.5).show();
    // Self Join
    model.approxSimilarityJoin(dfA, dfA, 2.5).filter("datasetA.id < datasetB.id").show();

    // Approximate nearest neighbor search
    model.approxNearestNeighbors(dfA, key, 2).show();
    model.approxNearestNeighbors(transformedA, key, 2).show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaChiSqSelectorExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
            RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
            RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()),
                    new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) });

    Dataset<Row> df = spark.createDataFrame(data, schema);

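    // ChiSqSelector ranks categorical features by a chi-squared test against the label and keeps the top ones.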
    ChiSqSelector selector = new ChiSqSelector().setNumTopFeatures(1).setFeaturesCol("features")
            .setLabelCol("clicked").setOutputCol("selectedFeatures");

    Dataset<Row> result = selector.fit(df).transform(df);

    System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures() + " features selected");
    result.show();

    // $example off$
    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaIndexToStringExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, "a"), RowFactory.create(1, "b"),
            RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"),
            RowFactory.create(5, "c"));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("category", DataTypes.StringType, false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

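    // StringIndexer encodes the string column into label indices, ordered by label frequency.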
    StringIndexerModel indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
            .fit(df);
    Dataset<Row> indexed = indexer.transform(df);

    System.out.println("Transformed string column '" + indexer.getInputCol() + "' " + "to indexed column '"
            + indexer.getOutputCol() + "'");
    indexed.show();

    StructField outputColSchema = indexed.schema().apply(indexer.getOutputCol());
    System.out.println("StringIndexer will store labels in output column metadata: "
            + Attribute.fromStructField(outputColSchema).toString() + "\n");

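    // IndexToString maps the indices back to the original labels stored in the column metadata.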
    IndexToString converter = new IndexToString().setInputCol("categoryIndex").setOutputCol("originalCategory");
    Dataset<Row> converted = converter.transform(indexed);

    System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to "
            + "original string column '" + converter.getOutputCol() + "' using labels in metadata");
    converted.select("id", "categoryIndex", "originalCategory").show();

    // $example off$
    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaInteractionExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaInteractionExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(1, 1, 2, 3, 8, 4, 5),
            RowFactory.create(2, 4, 3, 8, 7, 9, 8), RowFactory.create(3, 6, 1, 9, 2, 3, 6),
            RowFactory.create(4, 10, 8, 6, 9, 4, 5), RowFactory.create(5, 9, 2, 7, 10, 7, 3),
            RowFactory.create(6, 1, 1, 4, 2, 8, 4));

    StructType schema = new StructType(
            new StructField[] { new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("id7", DataTypes.IntegerType, false, Metadata.empty()) });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    VectorAssembler assembler1 = new VectorAssembler().setInputCols(new String[] { "id2", "id3", "id4" })
            .setOutputCol("vec1");

    Dataset<Row> assembled1 = assembler1.transform(df);

    VectorAssembler assembler2 = new VectorAssembler().setInputCols(new String[] { "id5", "id6", "id7" })
            .setOutputCol("vec2");

    Dataset<Row> assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2");

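    // Interaction outputs a vector of products over all combinations of values from the input columns.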
    Interaction interaction = new Interaction().setInputCols(new String[] { "id1", "vec1", "vec2" })
            .setOutputCol("interactedCol");

    Dataset<Row> interacted = interaction.transform(assembled2);

    interacted.show(false);
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaMaxAbsScalerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaMaxAbsScalerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
            RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
            RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0)));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    MaxAbsScaler scaler = new MaxAbsScaler().setInputCol("features").setOutputCol("scaledFeatures");

    // Compute summary statistics and generate MaxAbsScalerModel
    MaxAbsScalerModel scalerModel = scaler.fit(dataFrame);

    // Rescale each feature to the range [-1, 1].
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
    scaledData.select("features", "scaledFeatures").show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaMinHashLSHExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaMinHashLSHExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
            RowFactory.create(0, Vectors.sparse(6, new int[] { 0, 1, 2 }, new double[] { 1.0, 1.0, 1.0 })),
            RowFactory.create(1, Vectors.sparse(6, new int[] { 2, 3, 4 }, new double[] { 1.0, 1.0, 1.0 })),
            RowFactory.create(2, Vectors.sparse(6, new int[] { 0, 2, 4 }, new double[] { 1.0, 1.0, 1.0 })));

    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("keys", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

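    // MinHashLSH treats non-zero vector entries as set members and hashes for Jaccard distance.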
    MinHashLSH mh = new MinHashLSH().setNumHashTables(1).setInputCol("keys").setOutputCol("values");

    MinHashLSHModel model = mh.fit(dataFrame);
    model.transform(dataFrame).show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaMinMaxScalerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaMinMaxScalerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
            RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
            RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0)));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    MinMaxScaler scaler = new MinMaxScaler().setInputCol("features").setOutputCol("scaledFeatures");

    // Compute summary statistics and generate MinMaxScalerModel
    MinMaxScalerModel scalerModel = scaler.fit(dataFrame);

    // Rescale each feature to the range [min, max].
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
    System.out.println("Features scaled to range: [" + scaler.getMin() + ", " + scaler.getMax() + "]");
    scaledData.select("features", "scaledFeatures").show();
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaNGramExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
            RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
            RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat")));

    StructType schema = new StructType(new StructField[] {
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("words",
                    DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) });

    Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);

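    // NGram with n = 2 turns each token sequence into its sequence of bigrams.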
    NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");

    Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
    ngramDataFrame.select("ngrams").show(false);
    // $example off$

    spark.stop();
}

From source file: com.andado.spark.examples.ml.JavaNormalizerExample.java

License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaNormalizerExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
            RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
            RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0)));
    StructType schema = new StructType(
            new StructField[] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                    new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    // Normalize each Vector using $L^1$ norm.
    Normalizer normalizer = new Normalizer().setInputCol("features").setOutputCol("normFeatures").setP(1.0);

    Dataset<Row> l1NormData = normalizer.transform(dataFrame);
    l1NormData.show();

    // Normalize each Vector using $L^\infty$ norm.
    Dataset<Row> lInfNormData = normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
    lInfNormData.show();
    // $example off$

    spark.stop();
}