List of usage examples for org.apache.spark.sql.RowFactory#create
public static Row create(Object... values)
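Before the collected examples, here is a minimal, self-contained sketch (not taken from the sources below) of the pattern they all share: build Rows with RowFactory.create, pair them with a matching StructType, and create a DataFrame. The class name RowFactoryCreateSketch and the id/name columns are illustrative only; it assumes the same Spark 1.x SQLContext/DataFrame API used throughout this list.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryCreateSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowFactoryCreateSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);

        // RowFactory.create builds a Row from varargs; the value order must match the schema below.
        JavaRDD<Row> rows = jsc.parallelize(Arrays.asList(
                RowFactory.create(1, "alice"),
                RowFactory.create(2, "bob")));

        StructType schema = new StructType(new StructField[] {
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("name", DataTypes.StringType, false, Metadata.empty()) });

        DataFrame df = sqlContext.createDataFrame(rows, schema);
        df.show();

        jsc.stop();
    }
}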
From source file:com.estonteco.spark.frames.conf.factory.creator.impl.TextFileFrameCreator.java
public IDataFrame create(JavaSparkContext javaSparkContext, SQLContext context, DefaultFrameConf configuration) {
    Map<String, String> properties = configuration.getProperties();
    String url = getValue(properties, "URL", "");
    JavaRDD<String> textFile = javaSparkContext.textFile(url);
    final StructType schema = configuration.getSchema();
    final String delimiter = getValue(properties, "delimiter", ",");
    final boolean readHeader = Boolean.valueOf(getValue(properties, "header", "false"));

    JavaRDD<Row> rows = textFile.map(new Function<String, Row>() {
        long rowIndex = 0;

        public Row call(String record) throws Exception {
            Object[] cells = new Object[schema.size()];
            if (!readHeader && rowIndex++ == 0) {
                return RowFactory.create(cells);
            }
            String[] parts = record.split(delimiter);
            int i = 0;
            for (StructField sf : schema.fields()) {
                String cell = parts[i];
                if (sf.nullable() && (cell == null || cell.isEmpty())) {
                    cells[i] = null;
                } else {
                    cells[i] = cast(cell, sf.dataType());
                }
                i++;
            }
            return RowFactory.create(cells);
        }
    });

    DataFrame table = context.createDataFrame(rows, schema);
    table.registerTempTable(configuration.getName());
    if (configuration.isCache()) {
        table.cache();
    }
    return new DefaultDataFrame(table, configuration, State.INIT);
}
From source file:com.sdw.dream.spark.examples.ml.JavaBucketizerExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ double[] splits = { Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY }; JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), RowFactory.create(0.2))); StructType schema = new StructType( new StructField[] { new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) }); DataFrame dataFrame = jsql.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer().setInputCol("features").setOutputCol("bucketedFeatures") .setSplits(splits);/*from w w w .ja va2 s . c o m*/ // Transform original data into its bucket index. DataFrame bucketedData = bucketizer.transform(dataFrame); bucketedData.show(); // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaCountVectorizerExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ // Input data: Each row is a bag of words from a sentence or document. JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(RowFactory.create(Arrays.asList("a", "b", "c")), RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")))); StructType schema = new StructType(new StructField[] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); DataFrame df = sqlContext.createDataFrame(jrdd, schema); // fit a CountVectorizerModel from the corpus CountVectorizerModel cvModel = new CountVectorizer().setInputCol("text").setOutputCol("feature") .setVocabSize(3).setMinDF(2).fit(df); // alternatively, define CountVectorizerModel with a-priori vocabulary CountVectorizerModel cvm = new CountVectorizerModel(new String[] { "a", "b", "c" }).setInputCol("text") .setOutputCol("feature"); cvModel.transform(df).show();//from ww w. j ava 2 s . co m // $example off$ }
From source file:com.sdw.dream.spark.examples.ml.JavaDCTExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaDCTExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); DataFrame df = jsql.createDataFrame(data, schema); DCT dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false); DataFrame dctDf = dct.transform(df); dctDf.select("featuresDCT").show(3); // $example off$ jsc.stop();/*from ww w . j a va2s.c o m*/ }
From source file:com.sdw.dream.spark.examples.ml.JavaPCAExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaPCAExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ JavaRDD<Row> data = jsc.parallelize( Arrays.asList(RowFactory.create(Vectors.sparse(5, new int[] { 1, 3 }, new double[] { 1.0, 7.0 })), RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); DataFrame df = jsql.createDataFrame(data, schema); PCAModel pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df); DataFrame result = pca.transform(df).select("pcaFeatures"); result.show();//w w w . j a v a 2s . c o m // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaPolynomialExpansionExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion().setInputCol("features") .setOutputCol("polyFeatures").setDegree(3); JavaRDD<Row> data = jsc.parallelize(Arrays.asList(RowFactory.create(Vectors.dense(-2.0, 2.3)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)))); StructType schema = new StructType( new StructField[] { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); DataFrame df = jsql.createDataFrame(data, schema); DataFrame polyDF = polyExpansion.transform(df); Row[] row = polyDF.select("polyFeatures").take(3); for (Row r : row) { System.out.println(r.get(0)); }/*from w ww.j a v a2s .c o m*/ // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaStopWordsRemoverExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ StopWordsRemover remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered"); JavaRDD<Row> rdd = jsc/* w w w.java 2 s.c o m*/ .parallelize(Arrays.asList(RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")))); StructType schema = new StructType(new StructField[] { new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); DataFrame dataset = jsql.createDataFrame(rdd, schema); remover.transform(dataset).show(); // $example off$ jsc.stop(); }
From source file:com.sdw.dream.spark.examples.ml.JavaVectorSlicerExample.java
License:Apache License
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); // $example on$ Attribute[] attrs = new Attribute[] { NumericAttribute.defaultAttr().withName("f1"), NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList( RowFactory.create(Vectors.sparse(3, new int[] { 0, 1 }, new double[] { -2.0, 2.3 })), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)))); DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features"); vectorSlicer.setIndices(new int[] { 1 }).setNames(new String[] { "f3" }); // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) DataFrame output = vectorSlicer.transform(dataset); System.out.println(output.select("userFeatures", "features").first()); // $example off$ jsc.stop();/*from w w w . ja v a 2s . co m*/ }
From source file:com.thinkbiganalytics.spark.datavalidator.functions.CleanseAndValidateRow.java
License:Apache License
@Override
public CleansedRowResult call(@Nonnull final Row row) throws Exception {
    /*
     Cache for performance. Validators accept different parameters (numeric, string, etc.),
     so we need to resolve the type using reflection.
     */
    Map<Class, Class> validatorParamType = new HashMap<>();

    int nulls = hasProcessingDttm ? 1 : 0;

    // Create a placeholder for the new values plus one column for reject_reason
    Object[] newValues = new Object[dataTypes.length + 1];
    boolean rowValid = true;
    String sbRejectReason;
    List<ValidationResult> results = null;
    boolean[] columnsValid = new boolean[dataTypes.length];
    Map<Integer, Object> originalValues = new HashMap<>();

    // Iterate through the columns to cleanse and validate
    for (int idx = 0; idx < dataTypes.length; idx++) {
        ValidationResult result;
        FieldPolicy fieldPolicy = policies[idx];
        HCatDataType dataType = dataTypes[idx];
        boolean columnValid = true;
        boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);

        // Extract the value (allowing for a null or missing field for odd-ball data)
        Object val = (idx == row.length() || row.isNullAt(idx) ? null : row.get(idx));

        // Handle complex types by passing them through
        if (dataType.isUnchecked()) {
            if (val == null) {
                nulls++;
            }
            newValues[idx] = val;
            originalValues.put(idx, val);
        } else {
            Object fieldValue = (val);
            boolean isEmpty;

            if (fieldValue == null) {
                nulls++;
            }
            originalValues.put(idx, fieldValue);

            StandardizationAndValidationResult standardizationAndValidationResult = standardizeAndValidateField(
                    fieldPolicy, fieldValue, dataType, validatorParamType);
            result = standardizationAndValidationResult.getFinalValidationResult();

            // Only apply the standardized result value if the routine is valid
            fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;

            // Re-evaluate the isEmpty flag
            isEmpty = ((fieldValue == null) || (StringUtils.isEmpty(fieldValue.toString())));

            // If the field is a binary type but can't be converted, set it to null.
            // Hive will auto-convert byte[] or String fields to a target binary type.
            if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[])
                    && !(fieldValue instanceof String)) {
                // Set it to null
                fieldValue = null;
            } else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
                // If it's a numeric or binary column and the field is empty, set it to null as well
                fieldValue = null;
            }
            newValues[idx] = fieldValue;

            if (!result.isValid()) {
                rowValid = false;
                results = (results == null ? new Vector<ValidationResult>() : results);
                results.addAll(standardizationAndValidationResult.getValidationResults());
                columnValid = false;
            }
        }

        // Record whether the column was valid
        columnsValid[idx] = columnValid;
    }

    // Return success unless all values were null. That would indicate a blank line in the file.
    if (nulls >= dataTypes.length) {
        rowValid = false;
        results = (results == null ? new Vector<ValidationResult>() : results);
        results.add(ValidationResult.failRow("empty", "Row is empty"));
    }

    if (!rowValid) {
        for (int idx = 0; idx < dataTypes.length; idx++) {
            // If the value cannot match the _invalid table data type and the data type has changed,
            // replace it with the original value (the _invalid table data types match the source, not the destination)
            if (newValues[idx] == null || originalValues.get(idx) == null
                    || newValues[idx].getClass() != originalValues.get(idx).getClass()) {
                newValues[idx] = originalValues.get(idx);
            }
            // Otherwise the data has changed but is still the same data type, so we can keep the newly changed value
        }
    }

    // Convert the reject reasons to JSON
    sbRejectReason = toJSONArray(results);

    // Record the results in the appended columns, moving the processing partition value last
    if (hasProcessingDttm) {
        newValues[dataTypes.length] = newValues[dataTypes.length - 1]; // PROCESSING_DTTM_COL
        newValues[dataTypes.length - 1] = sbRejectReason; // REJECT_REASON_COL
    } else {
        newValues[dataTypes.length] = sbRejectReason;
    }

    return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
}
From source file:com.thinkbiganalytics.spark.jdbc.RowTransform.java
License:Apache License
/**
 * Converts the specified JDBC ResultSet into a Spark SQL Row.
 *
 * @param rs the result set
 * @return the Spark SQL row
 * @throws SQLException if a SQL error occurs
 */
@Nonnull
private Row mapRow(@Nonnull final ResultSet rs) throws SQLException {
    final Converter[] converters = getConverters(rs);
    final int columnCount = converters.length;
    final Object[] values = new Object[columnCount];

    for (int i = 0; i < columnCount; ++i) {
        values[i] = converters[i].convert(rs, i + 1);
    }

    return RowFactory.create(values);
}