Example usage for org.apache.spark.sql RowFactory create

Introduction

On this page you can find example usage for org.apache.spark.sql RowFactory create.

Prototype

public static Row create(Object... values) 

Document

Create a Row from the given arguments.
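
Before the project listings below, here is a minimal sketch of the prototype above. It is not taken from any of the projects on this page; it assumes the Spark 1.x SQLContext/DataFrame API used throughout these examples, and the class name and column names are made up for illustration. Each RowFactory.create call packs its arguments positionally into one Row, which is then matched against an explicit schema.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryCreateSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowFactoryCreateSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);

        // Each RowFactory.create call produces one Row; its values are
        // matched positionally against the schema declared below.
        JavaRDD<Row> rows = jsc.parallelize(Arrays.asList(
                RowFactory.create(1, "alice"),
                RowFactory.create(2, "bob")));

        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("name", DataTypes.StringType, true, Metadata.empty()) });

        DataFrame df = sqlContext.createDataFrame(rows, schema);
        df.show();

        jsc.stop();
    }
}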

Usage

From source file:com.estonteco.spark.frames.conf.factory.creator.impl.TextFileFrameCreator.java

public IDataFrame create(JavaSparkContext javaSparkContext, SQLContext context,
        DefaultFrameConf configuration) {
    Map<String, String> properties = configuration.getProperties();
    String url = getValue(properties, "URL", "");
    JavaRDD<String> textFile = javaSparkContext.textFile(url);
    final StructType schema = configuration.getSchema();
    final String delimiter = getValue(properties, "delimiter", ",");
    final boolean readHeader = Boolean.valueOf(getValue(properties, "header", "false"));
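    // Map each input line to a Row: split on the configured delimiter and cast each cell to the type declared in the schema.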
    JavaRDD<Row> rows = textFile.map(new Function<String, Row>() {
        long rowIndex = 0;

        public Row call(String record) throws Exception {
            Object[] cells = new Object[schema.size()];
            if (!readHeader && rowIndex++ == 0) {
                return RowFactory.create(cells);
            }
            String[] parts = record.split(delimiter);
            int i = 0;
            for (StructField sf : schema.fields()) {
                String cell = parts[i];
                if (sf.nullable() && (cell == null || cell.isEmpty())) {
                    cells[i] = null;
                } else {
                    cells[i] = cast(cell, sf.dataType());
                }
                i++;
            }
            return RowFactory.create(cells);
        }
    });
    DataFrame table = context.createDataFrame(rows, schema);
    table.registerTempTable(configuration.getName());
    if (configuration.isCache()) {
        table.cache();
    }
    return new DefaultDataFrame(table, configuration, State.INIT);
}

From source file:com.sdw.dream.spark.examples.ml.JavaBucketizerExample.java

License:Apache License

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY };

    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(-0.5), RowFactory.create(-0.3),
            RowFactory.create(0.0), RowFactory.create(0.2)));
    StructType schema = new StructType(
            new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) });
    DataFrame dataFrame = jsql.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures")
            .setSplits(splits);

    // Transform original data into its bucket index.
    DataFrame bucketedData = bucketizer.transform(dataFrame);
    bucketedData.show();
    // $example off$
    jsc.stop();
}

From source file:com.sdw.dream.spark.examples.ml.JavaCountVectorizerExample.java

License:Apache License

public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(RowFactory.create(Arrays.asList("a", "b", "c")),
            RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))));
    StructType schema = new StructType(new StructField[] {
            new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    // fit a CountVectorizerModel from the corpus
    CountVectorizerModel cvModel = new CountVectorizer().setInputCol("text").setOutputCol("feature")
            .setVocabSize(3).setMinDF(2).fit(df);

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text")
            .setOutputCol("feature");

    cvModel.transform(df).show();
    // $example off$
}

From source file:com.sdw.dream.spark.examples.ml.JavaDCTExample.java

License:Apache License

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaDCTExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
            RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
            RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))));
    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });
    DataFrame df = jsql.createDataFrame(data, schema);
    DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false);
    DataFrame dctDf = dct.transform(df);
    dctDf.select("featuresDCT").show(3);
    // $example off$
    jsc.stop();
}

From source file:com.sdw.dream.spark.examples.ml.JavaPCAExample.java

License:Apache License

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaPCAExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    JavaRDD<Row> data = jsc.parallelize(
            Arrays.asList(RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })),
                    RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
                    RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))));

    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });

    DataFrame df = jsql.createDataFrame(data, schema);

    PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df);

    DataFrame result = pca.transform(df).select("pcaFeatures");
    result.show();
    // $example off$
    jsc.stop();
}

From source file:com.sdw.dream.spark.examples.ml.JavaPolynomialExpansionExample.java

License:Apache License

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    PolynomialExpansion polyExpansion = new PolynomialExpansion().setInputCol("features")
            .setOutputCol("polyFeatures").setDegree(3);

    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(Vectors.dense(-2.0, 2.3)),
            RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1))));

    StructType schema = new StructType(
            new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), });

    DataFrame df = jsql.createDataFrame(data, schema);
    DataFrame polyDF = polyExpansion.transform(df);

    Row[] row = polyDF.select("polyFeatures").take(3);
    for (Row r : row) {
        System.out.println(r.get(0));
    }
    // $example off$
    jsc.stop();
}

From source file:com.sdw.dream.spark.examples.ml.JavaStopWordsRemoverExample.java

License:Apache License

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered");

    JavaRDD<Row> rdd = jsc
            .parallelize(Arrays.asList(RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
                    RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))));

    StructType schema = new StructType(new StructField[] {
            new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) });

    DataFrame dataset = jsql.createDataFrame(rdd, schema);
    remover.transform(dataset).show();
    // $example off$
    jsc.stop();
}

From source file:com.sdw.dream.spark.examples.ml.JavaVectorSlicerExample.java

License:Apache License

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    Attribute[] attrs = new Attribute[] { NumericAttribute.defaultAttr().withName("f1"),
            NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);

    JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
            RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })),
            RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))));

    DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));

    VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features");

    vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" });
    // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})

    DataFrame output = vectorSlicer.transform(dataset);

    System.out.println(output.select("userFeatures", "features").first());
    // $example off$
    jsc.stop();
}

From source file:com.thinkbiganalytics.spark.datavalidator.functions.CleanseAndValidateRow.java

License:Apache License

@Override
public CleansedRowResult call(@Nonnull final Row row) throws Exception {
    /*
    Cache for performance. Validators accept different parameter types (numeric, string, etc.), so we need to resolve the type using reflection.
    */
    Map<Class, Class> validatorParamType = new HashMap<>();

    int nulls = hasProcessingDttm ? 1 : 0;

    // Create a placeholder for the new values plus one column for reject_reason
    Object[] newValues = new Object[dataTypes.length + 1];
    boolean rowValid = true;
    String sbRejectReason;
    List<ValidationResult> results = null;
    boolean[] columnsValid = new boolean[dataTypes.length];

    Map<Integer, Object> originalValues = new HashMap<>();

    // Iterate through columns to cleanse and validate
    for (int idx = 0; idx < dataTypes.length; idx++) {
        ValidationResult result;
        FieldPolicy fieldPolicy = policies[idx];
        HCatDataType dataType = dataTypes[idx];
        boolean columnValid = true;
        boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);

        // Extract the value (allowing for null or missing field for odd-ball data)
        Object val = (idx == row.length() || row.isNullAt(idx) ? null : row.get(idx));
        // Handle complex types by passing them through

        if (dataType.isUnchecked()) {
            if (val == null) {
                nulls++;
            }
            newValues[idx] = val;
            originalValues.put(idx, val);
        } else {
            Object fieldValue = (val);
            boolean isEmpty;

            if (fieldValue == null) {
                nulls++;
            }
            originalValues.put(idx, fieldValue);

            StandardizationAndValidationResult standardizationAndValidationResult = standardizeAndValidateField(
                    fieldPolicy, fieldValue, dataType, validatorParamType);
            result = standardizationAndValidationResult.getFinalValidationResult();

            // Only apply the standardized result value if the result is valid
            fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;

            //reevaluate the isEmpty flag
            isEmpty = ((fieldValue == null) || (StringUtils.isEmpty(fieldValue.toString())));

            // If the field is a binary type but can't be converted, set it to null.
            // Hive will auto-convert byte[] or String fields to a target binary type.
            if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[])
                    && !(fieldValue instanceof String)) {
                //set it to null
                fieldValue = null;
            } else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
                // If it's a numeric column and the field is empty, set it to null as well
                fieldValue = null;
            }
            newValues[idx] = fieldValue;

            if (!result.isValid()) {
                rowValid = false;
                results = (results == null ? new Vector<ValidationResult>() : results);
                results.addAll(standardizationAndValidationResult.getValidationResults());
                columnValid = false;
            }

        }

        // Record whether this column was valid
        columnsValid[idx] = columnValid;
    }
    // Return success unless all values were null.  That would indicate a blank line in the file.
    if (nulls >= dataTypes.length) {
        rowValid = false;
        results = (results == null ? new Vector<ValidationResult>() : results);
        results.add(ValidationResult.failRow("empty", "Row is empty"));
    }

    if (!rowValid) {
        for (int idx = 0; idx < dataTypes.length; idx++) {
            // If the value cannot match the _invalid table's data types and the data type has changed, replace it with the original value.
            // The _invalid table's data types match the source, not the destination.
            if (newValues[idx] == null || originalValues.get(idx) == null
                    || newValues[idx].getClass() != originalValues.get(idx).getClass()) {
                newValues[idx] = originalValues.get(idx);
            }
            // Otherwise the data has changed but it's still the same data type, so we can keep the newly changed value

        }
    }

    // Convert the reject reasons to JSON
    sbRejectReason = toJSONArray(results);

    // Record the results in the appended columns, move processing partition value last
    if (hasProcessingDttm) {
        newValues[dataTypes.length] = newValues[dataTypes.length - 1]; //PROCESSING_DTTM_COL
        newValues[dataTypes.length - 1] = sbRejectReason; //REJECT_REASON_COL
    } else {
        newValues[dataTypes.length] = sbRejectReason;
    }

    return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
}

From source file:com.thinkbiganalytics.spark.jdbc.RowTransform.java

License:Apache License

/**
 * Converts the specified JDBC ResultSet into a Spark SQL Row.
 *
 * @param rs the result set
 * @return the Spark SQL row
 * @throws SQLException if a SQL error occurs
 */
@Nonnull
private Row mapRow(@Nonnull final ResultSet rs) throws SQLException {
    final Converter[] converters = getConverters(rs);
    final int columnCount = converters.length;
    final Object[] values = new Object[columnCount];

    for (int i = 0; i < columnCount; ++i) {
        values[i] = converters[i].convert(rs, i + 1);
    }

    return RowFactory.create(values);
}