Example usage for org.apache.spark.sql.types DataTypes StringType

List of usage examples for org.apache.spark.sql.types DataTypes StringType

Introduction

In this page you can find the example usage for org.apache.spark.sql.types DataTypes StringType.

Prototype

DataType StringType

To view the source code for org.apache.spark.sql.types DataTypes StringType, click the Source Link below.

Click Source Link

Document

Gets the StringType object.

Usage

From source file:JavaWord2VecExample.java

License:Apache License

public static void main(String[] args) {

    // Spark entry point (pre-2.0 style: SQLContext over a JavaSparkContext).
    SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    JavaRDD<Row> jrdd = jsc
            .parallelize(Arrays.asList(RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
                    RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
                    RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))));
    // Single non-nullable column "text" holding an array of (nullable) strings.
    StructType schema = new StructType(new StructField[] {
            new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
    DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);

    // Learn a mapping from words to Vectors (3-dimensional embeddings;
    // minCount 0 keeps every word regardless of frequency).
    Word2Vec word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(3)
            .setMinCount(0);
    Word2VecModel model = word2Vec.fit(documentDF);
    DataFrame result = model.transform(documentDF);
    for (Row r : result.select("result").take(3)) {
        System.out.println(r);
    }
    // $example off$

    // Release Spark resources before exiting (missing in the original).
    jsc.stop();
}

From source file:KafkaSparkMongo.java

License:Apache License

public static void main(String[] args) throws Exception {
    // Validate CLI arguments: a Kafka broker list and a topic list are required.
    if (args.length < 2) {
        // Fixed: the usage message previously named the wrong program
        // ("JavaDirectKafkaWordCount", copied from another example).
        System.err.println("Usage: KafkaSparkMongo <brokers> <topics>\n"
                + "  <brokers> is a list of one or more Kafka brokers\n"
                + "  <topics> is a list of one or more kafka topics to consume from\n\n");
        System.exit(1);
    }

    String brokers = args[0];
    String topics = args[1];

    // Target MongoDB database/collection; start from a clean database each run.
    String mongoUri = "mongodb://localhost/streamSparkFinal.coll";
    dropDatabase(mongoUri);

    // Create the streaming context with a 5 second batch interval
    // (the original comment incorrectly said 1 second).
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount")
            .set("spark.app.id", "MongoSparkConnectorTour").set("spark.mongodb.input.uri", mongoUri)
            .set("spark.mongodb.output.uri", mongoUri);

    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create a direct (receiver-less) Kafka stream for the given brokers/topics.
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class,
            String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

    messages.print();

    // Keep only the message payload (the Kafka value).
    JavaDStream<String> lines = messages.map(x -> x._2());

    // Each message is expected to carry 7 whitespace-separated fields:
    // serial, zone, group, x, y, z, ISO-8601 timestamp — TODO confirm with the producer.
    JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> {
        String[] wordy = SPACE.split(y);
        return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]);
    });

    // The schema is immutable, so build it once up front instead of
    // rebuilding it inside foreachRDD on every micro-batch (as before).
    List<StructField> subFields = new ArrayList<>();
    subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true));
    subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true));
    // NOTE(review): lowercase "z" is inconsistent with "X"/"Y", but renaming it
    // would change the schema persisted to MongoDB, so it is left unchanged.
    subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true));

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true));
    fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true));

    StructType schema = DataTypes.createStructType(fields);

    // For each micro-batch: convert the tuples to Rows, build a DataFrame,
    // display it and append it to the "pruebaF" collection in MongoDB.
    words.foreachRDD(rdd -> {
        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

        JavaRDD<Row> rowRDD = rdd
                .map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(),
                        RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()),
                                Double.parseDouble(palabra._6())),
                        Timestamp.from(Instant.parse(palabra._7()))));

        Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema);
        wordsDataFrame.show();
        MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save();
    });

    ssc.start();
    ssc.awaitTermination();
}

From source file:com.alpine.plugin.samples.ver1_0.JavaCountPlugin.CountPluginSparkJob.java

License:Open Source License

public DataFrame transform(OperatorParameters params, DataFrame inputDataFrame, SparkRuntimeUtils sparkUtils,
        OperatorListener listener) {
    // Column chosen by the user through the group-by operator parameter.
    String groupByVar = params.getTabularDatasetSelectedColumn(GroupByParamKey)._2();
    listener.notifyMessage("Starting the DataFrame Transformation");

    // Count the rows belonging to each distinct value of the selected column.
    DataFrame counts = inputDataFrame.select(groupByVar).groupBy(groupByVar).count();

    // Customize the output schema: (<group value>: string, GroupCount: long).
    List<StructField> outputFields = new ArrayList<>();
    outputFields.add(DataTypes.createStructField(groupByVar, DataTypes.StringType, true));
    outputFields.add(DataTypes.createStructField("GroupCount", DataTypes.LongType, true));
    StructType outputSchema = DataTypes.createStructType(outputFields);

    return inputDataFrame.sqlContext().createDataFrame(counts.rdd(), outputSchema);
}

From source file:com.andado.spark.examples.ml.JavaCountVectorizerExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();

    // $example on$
    // Each input row holds one bag of words.
    List<Row> corpus = Arrays.asList(
            RowFactory.create(Arrays.asList("a", "b", "c")),
            RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")));
    StructField textField = new StructField("text", new ArrayType(DataTypes.StringType, true), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { textField });
    Dataset<Row> df = spark.createDataFrame(corpus, schema);

    // Fit a CountVectorizerModel from the corpus (top-3 vocabulary,
    // terms must appear in at least 2 documents).
    CountVectorizer vectorizer = new CountVectorizer().setInputCol("text").setOutputCol("feature")
            .setVocabSize(3).setMinDF(2);
    CountVectorizerModel cvModel = vectorizer.fit(df);

    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
    CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text")
            .setOutputCol("feature");

    cvModel.transform(df).show(false);
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaElementwiseProductExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaElementwiseProductExample").getOrCreate();

    // $example on$
    // Two dense vectors keyed by id; sparse vectors would work the same way.
    List<Row> rows = Arrays.asList(
            RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
            RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)));

    List<StructField> schemaFields = new ArrayList<>(2);
    schemaFields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
    schemaFields.add(DataTypes.createStructField("vector", new VectorUDT(), false));
    StructType schema = DataTypes.createStructType(schemaFields);

    Dataset<Row> dataFrame = spark.createDataFrame(rows, schema);

    // Component-wise multiplier: each input vector is scaled by (0, 1, 2).
    Vector scalingVector = Vectors.dense(0.0, 1.0, 2.0);
    ElementwiseProduct transformer = new ElementwiseProduct()
            .setScalingVec(scalingVector)
            .setInputCol("vector")
            .setOutputCol("transformedVector");

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show();
    // $example off$
    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaIndexToStringExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate();

    // $example on$
    // Small labelled dataset of (id, category) pairs.
    List<Row> rows = Arrays.asList(
            RowFactory.create(0, "a"),
            RowFactory.create(1, "b"),
            RowFactory.create(2, "c"),
            RowFactory.create(3, "a"),
            RowFactory.create(4, "a"),
            RowFactory.create(5, "c"));
    StructField[] schemaFields = new StructField[] {
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("category", DataTypes.StringType, false, Metadata.empty()) };
    Dataset<Row> df = spark.createDataFrame(rows, new StructType(schemaFields));

    // Map each string category onto a numeric index.
    StringIndexerModel indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
            .fit(df);
    Dataset<Row> indexed = indexer.transform(df);

    System.out.println("Transformed string column '" + indexer.getInputCol() + "' " + "to indexed column '"
            + indexer.getOutputCol() + "'");
    indexed.show();

    // The indexer attaches its label list to the output column's metadata.
    StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
    System.out.println("StringIndexer will store labels in output column metadata: "
            + Attribute.fromStructField(inputColSchema).toString() + "\n");

    // Recover the original strings from the indices using those metadata labels.
    IndexToString converter = new IndexToString().setInputCol("categoryIndex").setOutputCol("originalCategory");
    Dataset<Row> converted = converter.transform(indexed);

    System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to "
            + "original string column '" + converter.getOutputCol() + "' using labels in metadata");
    converted.select("id", "categoryIndex", "originalCategory").show();

    // $example off$
    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaNGramExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate();

    // $example on$
    // Three token lists keyed by id.
    List<Row> rows = Arrays.asList(
            RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
            RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
            RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat")));

    StructField idField = new StructField("id", DataTypes.IntegerType, false, Metadata.empty());
    StructField wordsField = new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { idField, wordsField });

    Dataset<Row> wordDataFrame = spark.createDataFrame(rows, schema);

    // Produce bigrams (n = 2) from the "words" column.
    NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");

    Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
    ngramDataFrame.select("ngrams").show(false);
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaOneHotEncoderExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaOneHotEncoderExample").getOrCreate();

    // $example on$
    // (id, category) pairs to encode.
    List<Row> rows = Arrays.asList(
            RowFactory.create(0, "a"),
            RowFactory.create(1, "b"),
            RowFactory.create(2, "c"),
            RowFactory.create(3, "a"),
            RowFactory.create(4, "a"),
            RowFactory.create(5, "c"));

    StructField[] schemaFields = new StructField[] {
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("category", DataTypes.StringType, false, Metadata.empty()) };
    Dataset<Row> df = spark.createDataFrame(rows, new StructType(schemaFields));

    // First map each category string onto a numeric index...
    StringIndexerModel indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
            .fit(df);
    Dataset<Row> indexed = indexer.transform(df);

    // ...then one-hot encode the indices into sparse vectors.
    OneHotEncoder encoder = new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec");
    Dataset<Row> encoded = encoder.transform(indexed);
    encoded.show();
    // $example off$

    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaStopWordsRemoverExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();

    // $example on$
    // Drops stop words from the "raw" column into the "filtered" column.
    StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered");

    List<Row> rows = Arrays.asList(
            RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
            RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")));

    StructField rawField = new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty());
    StructType schema = new StructType(new StructField[] { rawField });

    Dataset<Row> dataset = spark.createDataFrame(rows, schema);
    remover.transform(dataset).show(false);
    // $example off$
    spark.stop();
}

From source file:com.andado.spark.examples.ml.JavaTfIdfExample.java

License:Apache License

public static void main(String[] args) {
    // Spark entry point for this example.
    SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate();

    // $example on$
    // Labelled sentences; TF-IDF features are computed per sentence.
    List<Row> rows = Arrays.asList(
            RowFactory.create(0.0, "Hi I heard about Spark"),
            RowFactory.create(0.0, "I wish Java could use case classes"),
            RowFactory.create(1.0, "Logistic regression models are neat"));
    StructField[] schemaFields = new StructField[] {
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) };
    Dataset<Row> sentenceData = spark.createDataFrame(rows, new StructType(schemaFields));

    // Split each sentence into words.
    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    Dataset<Row> wordsData = tokenizer.transform(sentenceData);

    // Hash the words into a fixed-size (20-bucket) term-frequency vector.
    int numFeatures = 20;
    HashingTF hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
            .setNumFeatures(numFeatures);
    Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    // Rescale the term frequencies by inverse document frequency.
    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    IDFModel idfModel = idf.fit(featurizedData);

    Dataset<Row> rescaledData = idfModel.transform(featurizedData);
    rescaledData.select("label", "features").show();
    // $example off$

    spark.stop();
}