List of usage examples for org.apache.spark.sql RowFactory create
public static Row create(Object... values)
From source file:JavaWord2VecExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ // Input data: Each row is a bag of words from a sentence or document. JavaRDD<Row> jrdd = jsc//from www. ja v a2 s . c om .parallelize(Arrays.asList(RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))))); StructType schema = new StructType(new StructField[] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema); // Learn a mapping from words to Vectors. Word2Vec word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(3) .setMinCount(0); Word2VecModel model = word2Vec.fit(documentDF); DataFrame result = model.transform(documentDF); for (Row r : result.select("result").take(3)) { System.out.println(r); } // $example off$ }
From source file:com.andado.spark.examples.ml.JavaBucketizerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate(); // $example on$ double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY }; List<Row> data = Arrays.asList(RowFactory.create(-999.9), RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), RowFactory.create(0.2), RowFactory.create(999.9)); StructType schema = new StructType( new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures") .setSplits(splits);/*from w w w. jav a 2s . com*/ // Transform original data into its bucket index. Dataset<Row> bucketedData = bucketizer.transform(dataFrame); System.out.println("Bucketizer output with " + (bucketizer.getSplits().length - 1) + " buckets"); bucketedData.show(); // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaCountVectorizerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate(); // $example on$ // Input data: Each row is a bag of words from a sentence or document. List<Row> data = Arrays.asList(RowFactory.create(Arrays.asList("a", "b", "c")), RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))); StructType schema = new StructType(new StructField[] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(data, schema); // fit a CountVectorizerModel from the corpus CountVectorizerModel cvModel = new CountVectorizer().setInputCol("text").setOutputCol("feature") .setVocabSize(3).setMinDF(2).fit(df); // alternatively, define CountVectorizerModel with a-priori vocabulary CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text") .setOutputCol("feature"); cvModel.transform(df).show(false);//from ww w.ja v a2s. com // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaDCTExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList(RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); Dataset<Row> df = spark.createDataFrame(data, schema); DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false); Dataset<Row> dctDf = dct.transform(df); dctDf.select("featuresDCT").show(false); // $example off$ spark.stop();//from ww w .j a va2 s .c om }
From source file:com.andado.spark.examples.ml.JavaPCAExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })), RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); Dataset<Row> df = spark.createDataFrame(data, schema); PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df); Dataset<Row> result = pca.transform(df).select("pcaFeatures"); result.show(false);// ww w.j a v a 2 s .co m // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaPolynomialExpansionExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate(); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion().setInputCol("features") .setOutputCol("polyFeatures").setDegree(3); List<Row> data = Arrays.asList(RowFactory.create(Vectors.dense(2.0, 1.0)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(3.0, -1.0))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); Dataset<Row> df = spark.createDataFrame(data, schema); Dataset<Row> polyDF = polyExpansion.transform(df); polyDF.show(false);/*from ww w . java2 s .c o m*/ // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaStopWordsRemoverExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate(); // $example on$ StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered"); List<Row> data = Arrays.asList(RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))); StructType schema = new StructType(new StructField[] { new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); Dataset<Row> dataset = spark.createDataFrame(data, schema); remover.transform(dataset).show(false); // $example off$ spark.stop();//from w w w . ja v a 2 s . c o m }
From source file:com.andado.spark.examples.ml.JavaVectorSlicerExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate(); // $example on$ Attribute[] attrs = new Attribute[] { NumericAttribute.defaultAttr().withName("f1"), NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); List<Row> data = Lists.newArrayList( RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))); Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features"); vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" }); // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) Dataset<Row> output = vectorSlicer.transform(dataset); output.show(false);//from w w w . j a v a 2s. c o m // $example off$ spark.stop(); }
From source file:com.andado.spark.examples.ml.JavaWord2VecExample.java
License:Apache License
public static void main(String[] args) { SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate(); // $example on$ // Input data: Each row is a bag of words from a sentence or document. List<Row> data = Arrays.asList(RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))); StructType schema = new StructType(new StructField[] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); Dataset<Row> documentDF = spark.createDataFrame(data, schema); // Learn a mapping from words to Vectors. Word2Vec word2Vec = new Word2Vec().setInputCol("text").setOutputCol("result").setVectorSize(3) .setMinCount(0);// w ww . j av a2s . com Word2VecModel model = word2Vec.fit(documentDF); Dataset<Row> result = model.transform(documentDF); for (Row row : result.collectAsList()) { List<String> text = row.getList(0); Vector vector = (Vector) row.get(1); System.out.println("Text: " + text + " => \nVector: " + vector + "\n"); } // $example off$ spark.stop(); }
From source file:com.bosscs.spark.commons.utils.CellsUtils.java
License:Apache License
/**
 * Builds a SparkSQL {@link Row} whose fields are the cell values of the
 * given Cells object, in the order {@code getCellValues()} returns them.
 *
 * @param cells Cells object to convert.
 * @return Row created from the cell values.
 */
public static Row getRowFromCells(Cells cells) {
    // The array is passed as the varargs array itself, so each cell value becomes one Row field.
    Object[] fieldValues = cells.getCellValues().toArray();
    Row row = RowFactory.create(fieldValues);
    return row;
}