com.thinkbiganalytics.kylo.catalog.spark.sources.AbstractJdbcDataSetProvider.java Source code

Introduction

Here is the source code for com.thinkbiganalytics.kylo.catalog.spark.sources.AbstractJdbcDataSetProvider.java
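
The class below is the common base for Kylo's JDBC data set providers. It normalizes PostgreSQL connection URLs, delegates reads and writes to Spark's DataFrameReader and DataFrameWriter, and supports incremental loads driven by three reader options: datefield (the date or timestamp column to window on), highwatermark (the key under which the last-seen value is tracked between runs), and overlap (seconds of the previous window to re-read). A hypothetical usage sketch follows the listing.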

Source

package com.thinkbiganalytics.kylo.catalog.spark.sources;

/*-
 * #%L
 * Kylo Catalog Core
 * %%
 * Copyright (C) 2017 - 2018 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.common.annotations.VisibleForTesting;
import com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClient;
import com.thinkbiganalytics.kylo.catalog.api.KyloCatalogException;
import com.thinkbiganalytics.kylo.catalog.spark.DataSetUtil;
import com.thinkbiganalytics.kylo.catalog.spark.SparkUtil;
import com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMark;
import com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMarkAccumulableParam;
import com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcHighWaterMarkVisitor;
import com.thinkbiganalytics.kylo.catalog.spark.sources.jdbc.JdbcRelationProvider;
import com.thinkbiganalytics.kylo.catalog.spi.DataSetOptions;
import com.thinkbiganalytics.kylo.catalog.spi.DataSetProvider;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.Accumulable;
import org.apache.spark.AccumulableParam;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.joda.time.DateTimeUtils;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.Properties;
import java.util.UUID;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import scala.Function1;
import scala.Option;
import scala.runtime.AbstractFunction1;

/**
 * Base implementation of a data set provider that can read from and write to JDBC tables.
 *
 * @param <T> Spark {@code DataFrame} class
 */
public abstract class AbstractJdbcDataSetProvider<T> implements DataSetProvider<T> {

    private static final Logger log = LoggerFactory.getLogger(AbstractJdbcDataSetProvider.class);

    /**
     * Option key to specify the date field for incremental loading
     */
    private static final String DATE_FIELD_OPTION = "datefield";

    /**
     * Option key to specify the name under which the high water mark is tracked for incremental reads
     */
    private static final String HIGH_WATER_MARK_OPTION = "highwatermark";

    /**
     * Option key to specify the number of seconds by which the load window overlaps the previous load
     */
    private static final String OVERLAP_OPTION = "overlap";
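
    // How the options interact (hypothetical values): with datefield=updated_at,
    // highwatermark=sales.orders.water and overlap=60, read() loads rows where
    // updated_at > (stored water mark - 60s) and updated_at < (now - 60s), then
    // advances the stored water mark to the largest updated_at value scanned.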

    @Override
    public boolean supportsFormat(@Nonnull final String source) {
        return "jdbc".equalsIgnoreCase(source) || "org.apache.spark.sql.jdbc".equals(source)
                || "org.apache.spark.sql.jdbc.DefaultSource".equals(source)
                || "org.apache.spark.sql.execution.datasources.jdbc".equals(source)
                || "org.apache.spark.sql.execution.datasources.jdbc.DefaultSource".equals(source)
                || "org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider".equals(source);
    }

    @Nonnull
    @Override
    public final T read(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options) {
        // Set url for PostgreSQL databases
        final Option<String> catalog = options.getOption("PGDBNAME");
        final Option<String> url = options.getOption("url");
        if (catalog.isDefined() && url.isDefined() && url.get().startsWith("jdbc:postgresql://")) {
            final String[] urlSplit = url.get().split("\\?", 2);
            final String[] pathSplit = urlSplit[0].substring(18).split("/", 2);

            if (pathSplit.length == 1 || StringUtils.equalsAny(pathSplit[1], "", "/")) {
                String catalogUrl = "jdbc:postgresql://" + pathSplit[0] + "/" + urlEncode(catalog.get());
                if (urlSplit.length == 2) {
                    catalogUrl += "?" + urlSplit[1];
                }
                options.setOption("url", catalogUrl);
            }
        }

        // Load data set
        final DataFrameReader reader = SparkUtil.prepareDataFrameReader(getDataFrameReader(client, options),
                options, null);
        reader.format(JdbcRelationProvider.class.getName());
        T dataSet = load(reader);

        // Handle high water mark
        final String dateField = SparkUtil.getOrElse(options.getOption(DATE_FIELD_OPTION), null);
        final String highWaterMarkKey = SparkUtil.getOrElse(options.getOption(HIGH_WATER_MARK_OPTION), null);
        final Long overlap = getOverlap(options);

        if (dateField != null && highWaterMarkKey != null) {
            final JdbcHighWaterMark initialValue = createHighWaterMark(highWaterMarkKey, client);
            dataSet = filterByDateTime(dataSet, dateField, initialValue.getValue(), overlap);
            dataSet = updateHighWaterMark(dataSet, dateField, initialValue, client);
        } else if (highWaterMarkKey != null) {
            log.warn("Ignoring '{}' option because '{}' option was not specified", HIGH_WATER_MARK_OPTION,
                    DATE_FIELD_OPTION);
        } else if (overlap != null) {
            log.warn("Ignoring '{}' option because '{}' and '{}' options were not specified", OVERLAP_OPTION,
                    DATE_FIELD_OPTION, HIGH_WATER_MARK_OPTION);
        }

        return dataSet;
    }

    @Override
    public final void write(@Nonnull final KyloCatalogClient<T> client, @Nonnull final DataSetOptions options,
            @Nonnull final T dataSet) {
        // Extract JDBC options
        final String url = DataSetUtil.getOptionOrThrow(options, "url", "Option 'url' is required");
        final String table = DataSetUtil.getOptionOrThrow(options, "dbtable", "Option 'dbtable' is required");

        final Properties properties = new Properties();
        properties.putAll(options.getOptions());
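        // Spark forwards every entry in this Properties object to the JDBC driver as a
        // connection property (user, password, driver class, and so on).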

        // Write to JDBC table
        final DataFrameWriter writer = SparkUtil.prepareDataFrameWriter(getDataFrameWriter(dataSet, options),
                options, null);
        writer.jdbc(url, table, properties);
    }

    /**
     * Creates an {@link Accumulable} shared variable with a name for display in the Spark UI.
     */
    @Nonnull
    protected abstract <R, P1> Accumulable<R, P1> accumulable(@Nonnull R initialValue, @Nonnull String name,
            @Nonnull AccumulableParam<R, P1> param, @Nonnull KyloCatalogClient<T> client);

    /**
     * Filters rows using the specified condition.
     */
    @Nonnull
    protected abstract T filter(@Nonnull T dataSet, @Nonnull Column condition);

    /**
     * Gets a reader from the specified client.
     *
     * <p>The options, format, and scheme will be applied to the reader before loading.</p>
     */
    @Nonnull
    protected abstract DataFrameReader getDataFrameReader(@Nonnull KyloCatalogClient<T> client,
            @Nonnull DataSetOptions options);

    /**
     * Gets a writer for the specified data set.
     *
     * <p>The options, format, mode, and partitioning will be applied to the writer before saving.</p>
     */
    @Nonnull
    protected abstract DataFrameWriter getDataFrameWriter(@Nonnull T dataSet, @Nonnull DataSetOptions options);

    /**
     * Loads a data set using the specified reader.
     */
    @Nonnull
    protected abstract T load(@Nonnull DataFrameReader reader);

    /**
     * Applies the specified function to the specified field of the data set.
     */
    @Nonnull
    protected abstract T map(@Nonnull T dataSet, @Nonnull String fieldName, @Nonnull Function1 function,
            @Nonnull DataType returnType);

    /**
     * Returns the schema of the specified data set.
     */
    @Nonnull
    protected abstract StructType schema(@Nonnull T dataSet);

    /**
     * Creates a {@link JdbcHighWaterMark} using the specified high water mark.
     *
     * <p>The value is initialized using the {@link KyloCatalogClient}.</p>
     */
    @Nonnull
    @VisibleForTesting
    JdbcHighWaterMark createHighWaterMark(@Nonnull final String highWaterMarkKey,
            @Nonnull final KyloCatalogClient<T> client) {
        final JdbcHighWaterMark highWaterMark = new JdbcHighWaterMark(highWaterMarkKey, client);
        highWaterMark.setFormatter(new LongToDateTime());

        // Set value
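        // (high water marks are stored as ISO-8601 date-time strings and parsed in UTC)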
        final String value = client.getHighWaterMarks().get(highWaterMarkKey);
        if (value != null) {
            try {
                highWaterMark.accumulate(ISODateTimeFormat.dateTimeParser().withZoneUTC().parseMillis(value));
            } catch (final IllegalArgumentException e) {
                throw new KyloCatalogException(
                        "Invalid value for high water mark " + highWaterMarkKey + ": " + value, e);
            }
        }

        return highWaterMark;
    }

    /**
     * Filters the specified data set using the specified date field.
     */
    @Nonnull
    @VisibleForTesting
    T filterByDateTime(@Nonnull final T dataSet, @Nonnull final String fieldName, @Nullable final Long value,
            @Nullable final Long overlap) {
        long startTime = 0;
        long endTime = DateTimeUtils.currentTimeMillis();

        // Parse high water mark
        if (value != null) {
            if (value < endTime) {
                startTime = value;
            } else {
                log.warn("Value for high water mark is the future: {}", value);
                startTime = endTime;
            }
        }

        // Parse overlap
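        // (both endpoints of the window shift back by the overlap, so rows near the
        // previous cutoff are re-read on the next run)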
        if (overlap != null) {
            startTime = Math.max(startTime - overlap, 0);
            endTime -= overlap;
        }

        // Return filter
        final Column dateColumn = new Column(fieldName);
        final Column startFilter = (startTime > 0) ? dateColumn.gt(functions.lit(new Timestamp(startTime))) : null;
        final Column endFilter = dateColumn.lt(functions.lit(new Timestamp(endTime)));
        return filter(dataSet, (startFilter != null) ? startFilter.and(endFilter) : endFilter);
    }

    /**
     * Gets the overlap from the specified options.
     */
    @Nullable
    @VisibleForTesting
    Long getOverlap(@Nonnull final DataSetOptions options) {
        final String overlap = SparkUtil.getOrElse(options.getOption(OVERLAP_OPTION), null);

        if (overlap != null) {
            try {
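                // The option value is in seconds; convert to milliseconds. Math.abs
                // treats a negative overlap the same as a positive one.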
                return Math.abs(Long.parseLong(overlap) * 1000);
            } catch (final NumberFormatException e) {
                throw new KyloCatalogException("Invalid value for 'overlap' option: " + overlap);
            }
        } else {
            return null;
        }
    }

    /**
     * Scans the specified field and updates the specified high water mark.
     */
    @Nonnull
    @VisibleForTesting
    T updateHighWaterMark(@Nonnull final T dataSet, @Nonnull final String fieldName,
            @Nonnull final JdbcHighWaterMark highWaterMark, @Nonnull final KyloCatalogClient<T> client) {
        // Determine function to convert column to Long
        final DataType fieldType = schema(dataSet).apply(fieldName).dataType();
        final Function1<?, Long> toLong;

        if (fieldType == DataTypes.DateType) {
            toLong = new DateToLong();
        } else if (fieldType == DataTypes.TimestampType) {
            toLong = new TimestampToLong();
        } else {
            throw new KyloCatalogException("Unsupported column type for high water mark: " + fieldType);
        }

        // Create UDF and apply to field
        final String accumulableId = (highWaterMark.getName() != null) ? highWaterMark.getName()
                : UUID.randomUUID().toString();
        final Accumulable<JdbcHighWaterMark, Long> accumulable = accumulable(highWaterMark, accumulableId,
                new JdbcHighWaterMarkAccumulableParam(), client);

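        // The visitor feeds each value into the accumulable and returns it unchanged,
        // so the shared high water mark advances as Spark scans the column.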
        final JdbcHighWaterMarkVisitor<?> visitor = new JdbcHighWaterMarkVisitor<>(accumulable, toLong);
        return map(dataSet, fieldName, visitor, fieldType);
    }

    @Nonnull
    private String urlEncode(@Nonnull final String s) {
        try {
            return URLEncoder.encode(s, "UTF-8");
        } catch (final UnsupportedEncodingException e) {
            throw new KyloCatalogException(e.toString(), e);
        }
    }

    /**
     * Converts a {@link Date} value to a {@link Long} value.
     */
    private static class DateToLong extends AbstractFunction1<Date, Long> implements Serializable {

        private static final long serialVersionUID = -8050299509057508031L;

        @Nullable
        @Override
        public Long apply(@Nullable final Date value) {
            return (value != null) ? value.getTime() : null;
        }
    }

    /**
     * Converts a {@link Long} value to a date and time string.
     */
    private static class LongToDateTime extends AbstractFunction1<Long, String> implements Serializable {

        private static final long serialVersionUID = 3461139184234190000L;

        /**
         * Date and time format for high water mark values
         */
        private static final DateTimeFormatter FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss")
                .withZoneUTC();

        @Nullable
        @Override
        public String apply(@Nullable final Long value) {
            return (value != null) ? FORMATTER.print(value) : null;
        }
    }

    /**
     * Converts a {@link Timestamp} value to a {@link Long} value.
     */
    private static class TimestampToLong extends AbstractFunction1<Timestamp, Long> implements Serializable {

        private static final long serialVersionUID = -2438416767972800004L;

        @Nullable
        @Override
        public Long apply(@Nullable final Timestamp value) {
            return (value != null) ? value.getTime() : null;
        }
    }
}
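
Usage example

The sketch below shows how the incremental-load options are typically supplied to this provider. It is illustrative only: it assumes a concrete provider implementation for Spark 2 (the type parameter T bound to Dataset<Row>) and that the catalog client's reader mirrors Spark's DataFrameReader API (format/option/load), as kylo-catalog-api suggests; the connection URL, table, and option values are all hypothetical.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClient;

class IncrementalJdbcReadExample {

    /** Loads only the rows of "orders" changed since the last run (hypothetical values). */
    static Dataset<Row> loadOrders(final KyloCatalogClient<Dataset<Row>> client) {
        return client.read()
                .format("jdbc")                                // matched by supportsFormat()
                .option("url", "jdbc:postgresql://db.example.com:5432/sales")
                .option("dbtable", "orders")
                .option("datefield", "updated_at")             // column defining the time window
                .option("highwatermark", "sales.orders.water") // key for the stored water mark
                .option("overlap", "60")                       // re-read the last 60 seconds
                .load();
    }
}

On the first run no water mark exists, so read() applies only the upper bound (rows earlier than now minus the overlap). On later runs the stored mark, shifted back by the overlap, becomes the lower bound, and the mark advances to the newest updated_at value scanned.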