List of usage examples for org.apache.spark.sql.RowFactory.create
public static Row create(Object... values)
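RowFactory.create builds a Row from the given arguments: argument i becomes field i of the created Row. Before the examples below, here is a minimal sketch of the usual pattern, assuming a local SparkSession; the class name, column names, and values are illustrative only. Rows are created with RowFactory.create, paired with a matching StructType, and passed to createDataFrame.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryCreateSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("RowFactoryCreateSketch")
            .master("local[*]") // assumption: run locally for this sketch
            .getOrCreate();

        // Each RowFactory.create call produces one Row; argument i becomes column i.
        List<Row> data = Arrays.asList(
            RowFactory.create(1, "alice"),
            RowFactory.create(2, "bob"));

        // The schema must line up with the Row fields by position and type.
        StructType schema = new StructType(new StructField[] {
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("name", DataTypes.StringType, false, Metadata.empty()) });

        Dataset<Row> df = spark.createDataFrame(data, schema);
        df.show();

        spark.stop();
    }
}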
From source file: dbx.compute.spark.jobs.ml.JavaBucketizerExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();

    // $example on$
    double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY };

    List<Row> data = Arrays.asList(
        RowFactory.create(-0.5),
        RowFactory.create(-0.3),
        RowFactory.create(0.0),
        RowFactory.create(0.2));
    StructType schema = new StructType(new StructField[] {
        new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer()
        .setInputCol("features")
        .setOutputCol("bucketedFeatures")
        .setSplits(splits);

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);
    bucketedData.show();
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaCountVectorizerExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("a", "b", "c")),
        RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")));
    StructType schema = new StructType(new StructField[] {
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    // fit a CountVectorizerModel from the corpus
    CountVectorizerModel cvModel = new CountVectorizer()
        .setInputCol("text")
        .setOutputCol("feature")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df);

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" })
        .setInputCol("text")
        .setOutputCol("feature");

    cvModel.transform(df).show();
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaDCTExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
        RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
        RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)));
    StructType schema = new StructType(new StructField[] {
        new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false);

    Dataset<Row> dctDf = dct.transform(df);
    dctDf.select("featuresDCT").show(3);
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaPCAExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })),
        RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
        RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)));
    StructType schema = new StructType(new StructField[] {
        new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df);

    Dataset<Row> result = pca.transform(df).select("pcaFeatures");
    result.show();
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaPolynomialExpansionExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate();

    // $example on$
    PolynomialExpansion polyExpansion = new PolynomialExpansion()
        .setInputCol("features")
        .setOutputCol("polyFeatures")
        .setDegree(3);

    List<Row> data = Arrays.asList(
        RowFactory.create(Vectors.dense(-2.0, 2.3)),
        RowFactory.create(Vectors.dense(0.0, 0.0)),
        RowFactory.create(Vectors.dense(0.6, -1.1)));
    StructType schema = new StructType(new StructField[] {
        new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    Dataset<Row> polyDF = polyExpansion.transform(df);

    List<Row> rows = polyDF.select("polyFeatures").takeAsList(3);
    for (Row r : rows) {
        System.out.println(r.get(0));
    }
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaStopWordsRemoverExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();

    // $example on$
    StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered");

    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
        RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")));
    StructType schema = new StructType(new StructField[] {
        new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) });
    Dataset<Row> dataset = spark.createDataFrame(data, schema);

    remover.transform(dataset).show();
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaVectorSlicerExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate();

    // $example on$
    Attribute[] attrs = new Attribute[] {
        NumericAttribute.defaultAttr().withName("f1"),
        NumericAttribute.defaultAttr().withName("f2"),
        NumericAttribute.defaultAttr().withName("f3") };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);

    List<Row> data = Lists.newArrayList(
        RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })),
        RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)));
    Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));

    VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features");
    vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" });
    // or vectorSlicer.setIndices(new int[] { 1, 2 }), or vectorSlicer.setNames(new String[] { "f2", "f3" })

    Dataset<Row> output = vectorSlicer.transform(dataset);
    System.out.println(output.select("userFeatures", "features").first());
    // $example off$

    spark.stop();
}
From source file: dbx.compute.spark.jobs.ml.JavaWord2VecExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
        RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
        RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))));
    StructType schema = new StructType(new StructField[] {
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
    Dataset<Row> documentDF = spark.createDataFrame(data, schema);

    // Learn a mapping from words to Vectors.
    Word2Vec word2Vec = new Word2Vec()
        .setInputCol("text")
        .setOutputCol("result")
        .setVectorSize(3)
        .setMinCount(0);
    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);
    for (Row r : result.select("result").takeAsList(3)) {
        System.out.println(r);
    }
    // $example off$

    spark.stop();
}
From source file: fire.nodes.dataset.NodeDatasetFileOrDirectoryCSV.java
License: Apache License

@Override
public void execute(JavaSparkContext ctx, SQLContext sqlContext, WorkflowContext workflowContext,
        DataFrame df) {
    workflowContext.out("Executing NodeDatasetFileOrDirectoryCSV : " + id);

    // Load a text file and convert each line to a JavaBean.
    JavaRDD<String> people = ctx.textFile(path);

    // filter the header row
    if (filterLinesContaining != null) {
        people = people.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                if (s.contains(filterLinesContaining))
                    return false;
                return true;
            }
        });
    }

    // get schema
    final StructType schema = getSparkSQLSchema();

    // Convert records of the RDD (people) to Rows.
    JavaRDD<Row> rowRDD = people.flatMap(new FlatMapFunction<String, Row>() {
        @Override
        public Iterable<Row> call(String record) throws Exception {
            List<Row> ll = new LinkedList<Row>();
            String[] fields = record.split(separator);

            // skip invalid records
            if (fields.length != schema.length())
                return ll;

            Object[] f = new Object[fields.length];
            int idx = 0;
            for (String field : fields) {
                f[idx] = parseField(fields[idx], schema.fields()[idx]);
                idx++;
            }
            Row row = RowFactory.create(f);
            ll.add(row);
            return ll;
        }
    });

    // Apply the schema to the RDD.
    DataFrame tdf = sqlContext.createDataFrame(rowRDD, schema);

    super.execute(ctx, sqlContext, workflowContext, tdf);
}
From source file: gtl.spark.java.example.apache.ml.JavaCorrelationExample.java
License: Apache License

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("JavaCorrelationExample").getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(Vectors.sparse(4, new int[] { 0, 3 }, new double[] { 1.0, -2.0 })),
        RowFactory.create(Vectors.dense(4.0, 5.0, 0.0, 3.0)),
        RowFactory.create(Vectors.dense(6.0, 7.0, 0.0, 8.0)),
        RowFactory.create(Vectors.sparse(4, new int[] { 0, 3 }, new double[] { 9.0, 1.0 })));
    StructType schema = new StructType(new StructField[] {
        new StructField("features", new VectorUDT(), false, Metadata.empty()) });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    Row r1 = Correlation.corr(df, "features").head();
    System.out.println("Pearson correlation matrix:\n" + r1.get(0).toString());

    Row r2 = Correlation.corr(df, "features", "spearman").head();
    System.out.println("Spearman correlation matrix:\n" + r2.get(0).toString());
    // $example off$

    spark.stop();
}