List of usage examples for org.apache.spark.sql.RowFactory#create
public static Row create(Object... values)
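Before the collected examples, here is a minimal, self-contained sketch (not taken from the sources below) of the pattern they all share: build Rows with RowFactory.create, pair them with a matching StructType, and create a DataFrame. The class name RowFactoryCreateSketch and the id/name columns are illustrative only; it assumes the same Spark 1.x SQLContext/DataFrame API used throughout this list.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryCreateSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowFactoryCreateSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);

        // RowFactory.create builds a Row from varargs; the value order must match the schema below.
        JavaRDD<Row> rows = jsc.parallelize(Arrays.asList(
                RowFactory.create(1, "alice"),
                RowFactory.create(2, "bob")));

        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("name", DataTypes.StringType, false, Metadata.empty()) });

        DataFrame df = sqlContext.createDataFrame(rows, schema);
        df.show();

        jsc.stop();
    }
}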
From source file:com.estonteco.spark.frames.conf.factory.creator.impl.TextFileFrameCreator.java
public IDataFrame create(JavaSparkContext javaSparkContext, SQLContext context, DefaultFrameConf configuration) {
    Map<String, String> properties = configuration.getProperties();
    String url = getValue(properties, "URL", "");
    JavaRDD<String> textFile = javaSparkContext.textFile(url);
    final StructType schema = configuration.getSchema();
    final String delimiter = getValue(properties, "delimiter", ",");
    final boolean readHeader = Boolean.valueOf(getValue(properties, "header", "false"));

    JavaRDD<Row> rows = textFile.map(new Function<String, Row>() {
        long rowIndex = 0;

        public Row call(String record) throws Exception {
            Object[] cells = new Object[schema.size()];
            if (!readHeader && rowIndex++ == 0) {
                return RowFactory.create(cells);
            }
            String[] parts = record.split(delimiter);
            int i = 0;
            for (StructField sf : schema.fields()) {
                String cell = parts[i];
                if (sf.nullable() && (cell == null || cell.isEmpty())) {
                    cells[i] = null;
                } else {
                    cells[i] = cast(cell, sf.dataType());
                }
                i++;
            }
            return RowFactory.create(cells);
        }
    });

    DataFrame table = context.createDataFrame(rows, schema);
    table.registerTempTable(configuration.getName());
    if (configuration.isCache()) {
        table.cache();
    }
    return new DefaultDataFrame(table, configuration, State.INIT);
}
From source file:com.sdw.dream.spark.examples.ml.JavaBucketizerExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY }; JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), RowFactory.create(0.2))); StructType schema = new StructType( new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) }); DataFrame dataFrame = jsql.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures") .setSplits(splits);/*from w w w .ja va2 s . c o m*/ // Transform original data into its bucket index. DataFrame bucketedData = bucketizer.transform(dataFrame); bucketedData.show(); // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaCountVectorizerExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ // Input data: Each row is a bag of words from a sentence or document. JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(RowFactory.create(Arrays.asList("a", "b", "c")), RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")))); StructType schema = new StructType(new StructField[] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); DataFrame df = sqlContext.createDataFrame(jrdd, schema); // fit a CountVectorizerModel from the corpus CountVectorizerModel cvModel = new CountVectorizer().setInputCol("text").setOutputCol("feature") .setVocabSize(3).setMinDF(2).fit(df); // alternatively, define CountVectorizerModel with a-priori vocabulary CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text") .setOutputCol("feature"); cvModel.transform(df).show();//from ww w. j ava 2 s . co m // $example off$ }
From source file:com.sdw.dream.spark.examples.ml.JavaDCTExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaDCTExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); DataFrame df = jsql.createDataFrame(data, schema); DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false); DataFrame dctDf = dct.transform(df); dctDf.select("featuresDCT").show(3); // $example off$ jsc.stop();/*from ww w . j a va2s.c o m*/ }
From source file:com.sdw.dream.spark.examples.ml.JavaPCAExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaPCAExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ JavaRDD<Row> data = jsc.parallelize( Arrays.asList(RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })), RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); DataFrame df = jsql.createDataFrame(data, schema); PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df); DataFrame result = pca.transform(df).select("pcaFeatures"); result.show();//w w w . j a v a 2s . c o m // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaPolynomialExpansionExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion().setInputCol("features") .setOutputCol("polyFeatures").setDegree(3); JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(Vectors.dense(-2.0, 2.3)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); DataFrame df = jsql.createDataFrame(data, schema); DataFrame polyDF = polyExpansion.transform(df); Row[] row = polyDF.select("polyFeatures").take(3); for (Row r : row) { System.out.println(r.get(0)); }/*from w ww.j a v a2s .c o m*/ // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaStopWordsRemoverExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered"); JavaRDD<Row> rdd = jsc/* w w w.java 2 s.c o m*/ .parallelize(Arrays.asList(RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")))); StructType schema = new StructType(new StructField[] { new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); DataFrame dataset = jsql.createDataFrame(rdd, schema); remover.transform(dataset).show(); // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaVectorSlicerExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ Attribute[] attrs = new Attribute[] { NumericAttribute.defaultAttr().withName("f1"), NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList( RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)))); DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features"); vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" }); // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) DataFrame output = vectorSlicer.transform(dataset); System.out.println(output.select("userFeatures", "features").first()); // $example off$ jsc.stop();/*from w w w . ja v a 2s . co m*/ }
From source file:com.thinkbiganalytics.spark.datavalidator.functions.CleanseAndValidateRow.java
License:Apache License
@Override
public CleansedRowResult call(@Nonnull final Row row) throws Exception {
    /*
     Cache for performance. Validators accept different parameters (numeric, string, etc.),
     so we need to resolve the type using reflection.
     */
    Map<Class, Class> validatorParamType = new HashMap<>();

    int nulls = hasProcessingDttm ? 1 : 0;

    // Create a placeholder for the new values plus one column for reject_reason
    Object[] newValues = new Object[dataTypes.length + 1];
    boolean rowValid = true;
    String sbRejectReason;
    List<ValidationResult> results = null;
    boolean[] columnsValid = new boolean[dataTypes.length];
    Map<Integer, Object> originalValues = new HashMap<>();

    // Iterate through the columns to cleanse and validate
    for (int idx = 0; idx < dataTypes.length; idx++) {
        ValidationResult result;
        FieldPolicy fieldPolicy = policies[idx];
        HCatDataType dataType = dataTypes[idx];
        boolean columnValid = true;
        boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);

        // Extract the value (allowing for a null or missing field for odd-ball data)
        Object val = (idx == row.length() || row.isNullAt(idx) ? null : row.get(idx));

        // Handle complex types by passing them through
        if (dataType.isUnchecked()) {
            if (val == null) {
                nulls++;
            }
            newValues[idx] = val;
            originalValues.put(idx, val);
        } else {
            Object fieldValue = (val);
            boolean isEmpty;

            if (fieldValue == null) {
                nulls++;
            }
            originalValues.put(idx, fieldValue);

            StandardizationAndValidationResult standardizationAndValidationResult = standardizeAndValidateField(
                    fieldPolicy, fieldValue, dataType, validatorParamType);
            result = standardizationAndValidationResult.getFinalValidationResult();

            // Only apply the standardized result value if the routine is valid
            fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;

            // Re-evaluate the isEmpty flag
            isEmpty = ((fieldValue == null) || (StringUtils.isEmpty(fieldValue.toString())));

            // If the field is a binary type but can't be converted, set it to null.
            // Hive will auto-convert byte[] or String fields to a target binary type.
            if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[])
                    && !(fieldValue instanceof String)) {
                // Set it to null
                fieldValue = null;
            } else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
                // If it's a numeric or binary column and the field is empty, set it to null as well
                fieldValue = null;
            }
            newValues[idx] = fieldValue;

            if (!result.isValid()) {
                rowValid = false;
                results = (results == null ? new Vector<ValidationResult>() : results);
                results.addAll(standardizationAndValidationResult.getValidationResults());
                columnValid = false;
            }
        }

        // Record whether the column was valid
        columnsValid[idx] = columnValid;
    }

    // Return success unless all values were null. That would indicate a blank line in the file.
    if (nulls >= dataTypes.length) {
        rowValid = false;
        results = (results == null ? new Vector<ValidationResult>() : results);
        results.add(ValidationResult.failRow("empty", "Row is empty"));
    }

    if (!rowValid) {
        for (int idx = 0; idx < dataTypes.length; idx++) {
            // If the value cannot match the _invalid table data type and the data type has changed,
            // replace it with the original value (the _invalid table data types match the source, not the destination)
            if (newValues[idx] == null || originalValues.get(idx) == null
                    || newValues[idx].getClass() != originalValues.get(idx).getClass()) {
                newValues[idx] = originalValues.get(idx);
            }
            // Otherwise the data has changed but is still the same data type, so we can keep the newly changed value
        }
    }

    // Convert the reject reasons to JSON
    sbRejectReason = toJSONArray(results);

    // Record the results in the appended columns, moving the processing partition value last
    if (hasProcessingDttm) {
        newValues[dataTypes.length] = newValues[dataTypes.length - 1]; // PROCESSING_DTTM_COL
        newValues[dataTypes.length - 1] = sbRejectReason; // REJECT_REASON_COL
    } else {
        newValues[dataTypes.length] = sbRejectReason;
    }

    return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
}
From source file:com.thinkbiganalytics.spark.jdbc.RowTransform.java
License:Apache License
/**
 * Converts the specified JDBC ResultSet into a Spark SQL Row.
 *
 * @param rs the result set
 * @return the Spark SQL row
 * @throws SQLException if a SQL error occurs
 */
@Nonnull
private Row mapRow(@Nonnull final ResultSet rs) throws SQLException {
    final Converter[] converters = getConverters(rs);
    final int columnCount = converters.length;
    final Object[] values = new Object[columnCount];

    for (int i = 0; i < columnCount; ++i) {
        values[i] = converters[i].convert(rs, i + 1);
    }

    return RowFactory.create(values);
}