List of usage examples for org.apache.spark.sql.types.DataTypes.StringType

DataTypes.StringType is the DataType instance that represents string values in Spark SQL. Every example below uses it to declare string columns (or arrays of strings) when building a StructType schema for a DataFrame or Dataset.
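Before the full examples, here is a minimal, self-contained sketch of the pattern they all share: declaring a string column with DataTypes.StringType and building a DataFrame against that schema. The class name, column name, and sample rows are illustrative only (not from any source file below), and a local master is assumed.

    import java.util.Arrays;
    import java.util.List;

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public class StringTypeSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .appName("StringTypeSketch").master("local[*]").getOrCreate();

            // DataTypes.StringType declares "name" as a nullable string column.
            StructType schema = DataTypes.createStructType(new StructField[] {
                    DataTypes.createStructField("name", DataTypes.StringType, true) });

            List<Row> data = Arrays.asList(RowFactory.create("alice"), RowFactory.create("bob"));
            Dataset<Row> df = spark.createDataFrame(data, schema);
            df.printSchema(); // root |-- name: string (nullable = true)

            spark.stop();
        }
    }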
From source file: JavaWord2VecExample.java
License: Apache License

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);

        // $example on$
        // Input data: Each row is a bag of words from a sentence or document.
        JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
                RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
                RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
                RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))));
        StructType schema = new StructType(new StructField[] {
                new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
        DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);

        // Learn a mapping from words to Vectors.
        Word2Vec word2Vec = new Word2Vec()
                .setInputCol("text")
                .setOutputCol("result")
                .setVectorSize(3)
                .setMinCount(0);
        Word2VecModel model = word2Vec.fit(documentDF);
        DataFrame result = model.transform(documentDF);
        for (Row r : result.select("result").take(3)) {
            System.out.println(r);
        }
        // $example off$
    }
From source file: KafkaSparkMongo.java
License: Apache License

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: KafkaSparkMongo <brokers> <topics>\n"
                    + "  <brokers> is a list of one or more Kafka brokers\n"
                    + "  <topics> is a list of one or more Kafka topics to consume from\n\n");
            System.exit(1);
        }

        String brokers = args[0];
        String topics = args[1];
        String UriMongo = "mongodb://localhost/streamSparkFinal.coll";
        dropDatabase(UriMongo);

        // Create the streaming context with a 5 second batch size.
        SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount")
                .set("spark.app.id", "MongoSparkConnectorTour")
                .set("spark.mongodb.input.uri", UriMongo)
                .set("spark.mongodb.output.uri", UriMongo);
        JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));

        // Create a direct Kafka stream over the given brokers and topics.
        Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("metadata.broker.list", brokers);
        JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(ssc, String.class,
                String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
        messages.print();

        JavaDStream<String> lines = messages.map(x -> x._2());

        // Each message carries seven space-separated fields.
        JavaDStream<Tuple7<String, String, String, String, String, String, String>> words = lines.map(y -> {
            String[] wordy = SPACE.split(y);
            return new Tuple7<>(wordy[0], wordy[1], wordy[2], wordy[3], wordy[4], wordy[5], wordy[6]);
        });

        words.foreachRDD(rdd -> {
            List<StructField> subFields = new ArrayList<>();
            subFields.add(DataTypes.createStructField("X", DataTypes.DoubleType, true));
            subFields.add(DataTypes.createStructField("Y", DataTypes.DoubleType, true));
            subFields.add(DataTypes.createStructField("z", DataTypes.DoubleType, true));

            List<StructField> fields = new ArrayList<>();
            fields.add(DataTypes.createStructField("Serial", DataTypes.StringType, true));
            fields.add(DataTypes.createStructField("Zone", DataTypes.StringType, true));
            fields.add(DataTypes.createStructField("Group", DataTypes.StringType, true));
            fields.add(DataTypes.createStructField("coord", DataTypes.createStructType(subFields), true));
            fields.add(DataTypes.createStructField("Time", DataTypes.TimestampType, true));
            StructType schema = DataTypes.createStructType(fields);

            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            JavaRDD<Row> rowRDD = rdd.map(palabra -> RowFactory.create(palabra._1(), palabra._2(), palabra._3(),
                    RowFactory.create(Double.parseDouble(palabra._4()), Double.parseDouble(palabra._5()),
                            Double.parseDouble(palabra._6())),
                    Timestamp.from(Instant.parse(palabra._7()))));
            Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, schema);
            wordsDataFrame.show();
            MongoSpark.write(wordsDataFrame).option("collection", "pruebaF").mode("append").save();
        });

        ssc.start();
        ssc.awaitTermination();
    }
From source file: com.alpine.plugin.samples.ver1_0.JavaCountPlugin.CountPluginSparkJob.java
License: Open Source License

    public DataFrame transform(OperatorParameters params, DataFrame inputDataFrame, SparkRuntimeUtils sparkUtils,
            OperatorListener listener) {
        String groupByVar = params.getTabularDatasetSelectedColumn(GroupByParamKey)._2();
        listener.notifyMessage("Starting the DataFrame Transformation");
        DataFrame selectedData = inputDataFrame.select(groupByVar);
        DataFrame df = selectedData.groupBy(groupByVar).count();

        // Customize the output schema.
        List<StructField> fields = new ArrayList<StructField>();
        fields.add(DataTypes.createStructField(groupByVar, DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("GroupCount", DataTypes.LongType, true));
        StructType dfSchema = DataTypes.createStructType(fields);
        return inputDataFrame.sqlContext().createDataFrame(df.rdd(), dfSchema);
    }
From source file: com.andado.spark.examples.ml.JavaCountVectorizerExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();

        // $example on$
        // Input data: Each row is a bag of words from a sentence or document.
        List<Row> data = Arrays.asList(
                RowFactory.create(Arrays.asList("a", "b", "c")),
                RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")));
        StructType schema = new StructType(new StructField[] {
                new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
        Dataset<Row> df = spark.createDataFrame(data, schema);

        // Fit a CountVectorizerModel from the corpus.
        CountVectorizerModel cvModel = new CountVectorizer()
                .setInputCol("text")
                .setOutputCol("feature")
                .setVocabSize(3)
                .setMinDF(2)
                .fit(df);

        // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
        CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" })
                .setInputCol("text")
                .setOutputCol("feature");

        cvModel.transform(df).show(false);
        // $example off$

        spark.stop();
    }
From source file: com.andado.spark.examples.ml.JavaElementwiseProductExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaElementwiseProductExample").getOrCreate();

        // $example on$
        // Create some vector data; also works for sparse vectors.
        List<Row> data = Arrays.asList(
                RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
                RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)));

        List<StructField> fields = new ArrayList<>(2);
        fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
        fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));
        StructType schema = DataTypes.createStructType(fields);

        Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

        Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
        ElementwiseProduct transformer = new ElementwiseProduct()
                .setScalingVec(transformingVector)
                .setInputCol("vector")
                .setOutputCol("transformedVector");

        // Batch transform the vectors to create a new column.
        transformer.transform(dataFrame).show();
        // $example off$

        spark.stop();
    }
From source file: com.andado.spark.examples.ml.JavaIndexToStringExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate();

        // $example on$
        List<Row> data = Arrays.asList(RowFactory.create(0, "a"), RowFactory.create(1, "b"),
                RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"),
                RowFactory.create(5, "c"));
        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("category", DataTypes.StringType, false, Metadata.empty()) });
        Dataset<Row> df = spark.createDataFrame(data, schema);

        StringIndexerModel indexer = new StringIndexer()
                .setInputCol("category")
                .setOutputCol("categoryIndex")
                .fit(df);
        Dataset<Row> indexed = indexer.transform(df);

        System.out.println("Transformed string column '" + indexer.getInputCol() + "' "
                + "to indexed column '" + indexer.getOutputCol() + "'");
        indexed.show();

        StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
        System.out.println("StringIndexer will store labels in output column metadata: "
                + Attribute.fromStructField(inputColSchema).toString() + "\n");

        IndexToString converter = new IndexToString()
                .setInputCol("categoryIndex")
                .setOutputCol("originalCategory");
        Dataset<Row> converted = converter.transform(indexed);

        System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to "
                + "original string column '" + converter.getOutputCol() + "' using labels in metadata");
        converted.select("id", "categoryIndex", "originalCategory").show();
        // $example off$

        spark.stop();
    }
From source file: com.andado.spark.examples.ml.JavaNGramExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate();

        // $example on$
        List<Row> data = Arrays.asList(
                RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
                RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
                RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat")));
        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false,
                        Metadata.empty()) });
        Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);

        NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");

        Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
        ngramDataFrame.select("ngrams").show(false);
        // $example off$

        spark.stop();
    }
From source file: com.andado.spark.examples.ml.JavaOneHotEncoderExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaOneHotEncoderExample").getOrCreate();

        // $example on$
        List<Row> data = Arrays.asList(RowFactory.create(0, "a"), RowFactory.create(1, "b"),
                RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"),
                RowFactory.create(5, "c"));
        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("category", DataTypes.StringType, false, Metadata.empty()) });
        Dataset<Row> df = spark.createDataFrame(data, schema);

        StringIndexerModel indexer = new StringIndexer()
                .setInputCol("category")
                .setOutputCol("categoryIndex")
                .fit(df);
        Dataset<Row> indexed = indexer.transform(df);

        OneHotEncoder encoder = new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec");

        Dataset<Row> encoded = encoder.transform(indexed);
        encoded.show();
        // $example off$

        spark.stop();
    }
From source file: com.andado.spark.examples.ml.JavaStopWordsRemoverExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();

        // $example on$
        StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered");

        List<Row> data = Arrays.asList(
                RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
                RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")));
        StructType schema = new StructType(new StructField[] {
                new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false,
                        Metadata.empty()) });
        Dataset<Row> dataset = spark.createDataFrame(data, schema);

        remover.transform(dataset).show(false);
        // $example off$

        spark.stop();
    }
From source file: com.andado.spark.examples.ml.JavaTfIdfExample.java
License: Apache License

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate();

        // $example on$
        List<Row> data = Arrays.asList(
                RowFactory.create(0.0, "Hi I heard about Spark"),
                RowFactory.create(0.0, "I wish Java could use case classes"),
                RowFactory.create(1.0, "Logistic regression models are neat"));
        StructType schema = new StructType(new StructField[] {
                new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) });
        Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

        Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
        Dataset<Row> wordsData = tokenizer.transform(sentenceData);

        int numFeatures = 20;
        HashingTF hashingTF = new HashingTF()
                .setInputCol("words")
                .setOutputCol("rawFeatures")
                .setNumFeatures(numFeatures);
        Dataset<Row> featurizedData = hashingTF.transform(wordsData);
        // Alternatively, CountVectorizer can also be used to get term frequency vectors.

        IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
        IDFModel idfModel = idf.fit(featurizedData);
        Dataset<Row> rescaledData = idfModel.transform(featurizedData);

        rescaledData.select("label", "features").show();
        // $example off$

        spark.stop();
    }