Java tutorial
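
This page walks through NodeDatasetFileOrDirectoryCSV from the fire.nodes.dataset package. The node points at a CSV file or directory, filters out header lines, parses each record into the Java types declared in its Spark SQL schema, builds a DataFrame, and hands that DataFrame to the downstream nodes via super.execute(). The full listing follows; a small standalone sketch of the same pattern appears after it.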
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package fire.nodes.dataset;

import fire.workflowengine.WorkflowContext;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.Serializable;
import java.util.LinkedList;
import java.util.List;

/**
 * Created by jayantshekhar
 *
 * Represents a Dataset Node which points to data in a file or directory containing CSV-formatted data.
 */
public class NodeDatasetFileOrDirectoryCSV extends NodeDatasetFileOrDirectory implements Serializable {

    // field separator in the input file
    public String separator = ",";

    // filter out the lines that contain this string; used to drop header lines
    public String filterLinesContaining = null;

    public NodeDatasetFileOrDirectoryCSV(int i, String nm, String p) {
        super(i, nm, p);
    }

    public NodeDatasetFileOrDirectoryCSV(int i, String nm, String p, String cols, String colTypes, String colmlTypes) {
        super(i, nm, p, cols, colTypes);
    }

    public NodeDatasetFileOrDirectoryCSV() {
    }

    // convert a raw string field into the Java type declared for it in the schema
    public Object parseField(String string, StructField field) {
        if (field.dataType().sameType(DataTypes.IntegerType)) {
            return Integer.parseInt(string);
        }
        if (field.dataType().sameType(DataTypes.DoubleType)) {
            return Double.parseDouble(string);
        }
        if (field.dataType().sameType(DataTypes.StringType)) {
            return string.trim();
        }
        return string;
    }

    //------------------------------------------------------------------------------------------------------

    @Override
    public void execute(JavaSparkContext ctx, SQLContext sqlContext, WorkflowContext workflowContext, DataFrame df) {
        workflowContext.out("Executing NodeDatasetFileOrDirectoryCSV : " + id);

        // Load the text file(s); each element of the RDD is one line of the input.
        JavaRDD<String> people = ctx.textFile(path);

        // filter out the header rows
        if (filterLinesContaining != null) {
            people = people.filter(new Function<String, Boolean>() {
                @Override
                public Boolean call(String s) throws Exception {
                    return !s.contains(filterLinesContaining);
                }
            });
        }

        // get the Spark SQL schema of this dataset
        final StructType schema = getSparkSQLSchema();

        // Convert records of the RDD (people) to Rows.
        JavaRDD<Row> rowRDD = people.flatMap(new FlatMapFunction<String, Row>() {
            @Override
            public Iterable<Row> call(String record) throws Exception {
                List<Row> ll = new LinkedList<Row>();

                String[] fields = record.split(separator);

                // skip invalid records whose field count does not match the schema
                if (fields.length != schema.length())
                    return ll;

                // parse each field into the type declared for it in the schema
                Object[] f = new Object[fields.length];
                for (int idx = 0; idx < fields.length; idx++) {
                    f[idx] = parseField(fields[idx], schema.fields()[idx]);
                }

                ll.add(RowFactory.create(f));
                return ll;
            }
        });

        // Apply the schema to the RDD of Rows.
        DataFrame tdf = sqlContext.createDataFrame(rowRDD, schema);

        // pass the resulting DataFrame on to the downstream nodes
        super.execute(ctx, sqlContext, workflowContext, tdf);
    }

    //------------------------------------------------------------------------------------------------------
}
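
For reference, the same read-parse-apply-schema pattern can be tried outside the Fire node framework. The following is a minimal, self-contained sketch against the Spark 1.x Java API; the class name CsvToDataFrameExample, the input file people.csv, the three-column schema, and the local[*] master are illustrative assumptions, not part of the listing above.

// A minimal standalone sketch of the same pattern, assuming Spark 1.x and a local
// CSV file "people.csv" with lines like "Alice,34,72.5". The class name, file name,
// schema, and master URL are illustrative, not taken from the listing above.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;

public class CsvToDataFrameExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CsvToDataFrameExample").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // schema: name (string), age (int), weight (double)
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true),
                DataTypes.createStructField("weight", DataTypes.DoubleType, true)));

        // read the file and turn each line into a Row whose fields match the schema
        JavaRDD<Row> rows = sc.textFile("people.csv").map(new Function<String, Row>() {
            @Override
            public Row call(String line) throws Exception {
                String[] f = line.split(",");
                return RowFactory.create(f[0].trim(),
                                         Integer.parseInt(f[1].trim()),
                                         Double.parseDouble(f[2].trim()));
            }
        });

        // apply the schema to get a DataFrame and inspect it
        DataFrame people = sqlContext.createDataFrame(rows, schema);
        people.printSchema();
        people.show();

        sc.stop();
    }
}

As in the node above, the key steps are reading lines with textFile, converting each line into a Row whose fields match the declared types, and applying the schema with sqlContext.createDataFrame.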