org.apache.gobblin.data.management.conversion.hive.validation.ValidationJob.java Source code

Introduction

Here is the source code for org.apache.gobblin.data.management.conversion.hive.validation.ValidationJob.java, an Azkaban job that validates Hive table conversions (for example, Avro to ORC) by comparing row counts or checking file formats between source and destination.
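
Before the listing, here is a minimal sketch of how the job might be driven outside Azkaban. It assumes the surrounding Gobblin and Hadoop configuration (dataset finder, metastore URI, and so on) is already in place; the class name ValidationJobRunner, the job id, and the property values are hypothetical, while the property keys come from the class below:

import java.util.Properties;

import org.apache.gobblin.data.management.conversion.hive.validation.ValidationJob;

public class ValidationJobRunner {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Select the validation mode: COUNT_VALIDATION or FILE_FORMAT_VALIDATION.
        props.setProperty("hive.validation.type", "COUNT_VALIDATION");
        // Validate tables/partitions updated between 3 days ago and 1 day ago (the defaults).
        props.setProperty("hive.source.maximum.lookbackDays", "3");
        props.setProperty("hive.source.skip.recentThanDays", "1");
        // Azkaban normally supplies the job id; this one is made up.
        new ValidationJob("hive-conversion-validation", props).run();
    }
}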

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.data.management.conversion.hive.validation;

import org.apache.gobblin.config.client.ConfigClient;
import org.apache.gobblin.config.client.api.VersionStabilityPolicy;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.data.management.conversion.hive.task.HiveConverterUtils;
import org.apache.gobblin.util.PathUtils;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.slf4j.LoggerFactory;

import azkaban.jobExecutor.AbstractJob;

import com.google.common.base.Charsets;
import com.google.common.base.Enums;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closer;
import com.google.common.util.concurrent.UncheckedExecutionException;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

import org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset;
import org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetFinder;
import org.apache.gobblin.data.management.conversion.hive.events.EventConstants;
import org.apache.gobblin.data.management.conversion.hive.provider.HiveUnitUpdateProvider;
import org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException;
import org.apache.gobblin.data.management.conversion.hive.provider.UpdateProviderFactory;
import org.apache.gobblin.data.management.conversion.hive.query.HiveValidationQueryGenerator;
import org.apache.gobblin.data.management.conversion.hive.source.HiveSource;
import org.apache.gobblin.data.management.copy.hive.HiveDataset;
import org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder;
import org.apache.gobblin.data.management.copy.hive.HiveUtils;
import org.apache.gobblin.hive.HiveMetastoreClientPool;
import org.apache.gobblin.hive.HiveSerDeWrapper;
import org.apache.gobblin.util.HiveJdbcConnector;
import org.apache.gobblin.instrumented.Instrumented;
import org.apache.gobblin.metrics.MetricContext;
import org.apache.gobblin.metrics.event.EventSubmitter;
import org.apache.gobblin.util.AutoReturnableObject;
import org.apache.gobblin.util.ConfigUtils;
import org.apache.gobblin.util.ExecutorsUtils;
import org.apache.gobblin.util.HadoopUtils;

/**
 * Azkaban job that runs validation of conversion between two Hive tables
 *
 * @author Abhishek Tiwari
 */
public class ValidationJob extends AbstractJob {
    private static final Logger log = Logger.getLogger(ValidationJob.class);

    /***
     * The Validation Job validates tables and/or partitions updated within a specific window.
     * This window is determined as follows:
     * Start ($start_time) : CURRENT_TIME - hive.source.maximum.lookbackDays
     * End   ($end_time)   : CURRENT_TIME - hive.source.skip.recentThanDays
     * i.e. the resultant window for validation is: $start_time <= update_time <= $end_time
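     *
     * For example, with the defaults (lookbackDays = 3, recentThanDays = 1), a run starting at
     * 2020-01-10 00:00 would validate units updated between 2020-01-07 00:00 and 2020-01-09 00:00.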
     */
    private static final String HIVE_SOURCE_SKIP_RECENT_THAN_DAYS_KEY = "hive.source.skip.recentThanDays";
    private static final String HIVE_DATASET_CONFIG_AVRO_PREFIX = "hive.conversion.avro";
    private static final String DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS = "3";
    private static final String DEFAULT_HIVE_SOURCE_SKIP_RECENT_THAN_DAYS = "1";

    private static final String MAX_THREAD_COUNT = "validation.maxThreadCount";
    private static final String DEFAULT_MAX_THREAD_COUNT = "50";
    private static final String VALIDATION_TYPE_KEY = "hive.validation.type";
    private static final String HIVE_VALIDATION_IGNORE_DATA_PATH_IDENTIFIER_KEY = "hive.validation.ignoreDataPathIdentifier";
    private static final String DEFAULT_HIVE_VALIDATION_IGNORE_DATA_PATH_IDENTIFIER = StringUtils.EMPTY;
    private static final Splitter COMMA_BASED_SPLITTER = Splitter.on(",").omitEmptyStrings().trimResults();
    private static final Splitter EQUALITY_SPLITTER = Splitter.on("=").omitEmptyStrings().trimResults();
    private static final Splitter SLASH_SPLITTER = Splitter.on("/").omitEmptyStrings().trimResults();
    private static final String VALIDATION_FILE_FORMAT_KEY = "hive.validation.fileFormat";
    private static final String IS_NESTED_ORC = "hive.validation.isNestedORC";
    private static final String DEFAULT_IS_NESTED_ORC = "false";
    private static final String HIVE_SETTINGS = "hive.settings";
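    // Partition names are expected to carry a "datepartition=yyyy-MM-dd-HH" token,
    // which getPartitionCreateTime() parses to derive the partition create time.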
    private static final String DATEPARTITION = "datepartition";
    private static final String DATE_FORMAT = "yyyy-MM-dd-HH";
    public static final String GOBBLIN_CONFIG_TAGS_WHITELIST = "gobblin.config.tags.whitelist";

    private final ValidationType validationType;
    private List<String> ignoreDataPathIdentifierList;
    private final List<Throwable> throwables;
    private final Properties props;
    private final MetricContext metricContext;
    private final EventSubmitter eventSubmitter;
    private final HiveUnitUpdateProvider updateProvider;
    private final ConvertibleHiveDatasetFinder datasetFinder;
    private final long maxLookBackTime;
    private final long skipRecentThanTime;
    private final HiveMetastoreClientPool pool;
    private final FileSystem fs;
    private final ExecutorService exec;
    private final List<Future<Void>> futures;
    private final Boolean isNestedORC;
    private final List<String> hiveSettings;
    protected Optional<String> configStoreUri;
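    /** Maximum number of partitions fetched per table from the metastore via listPartitions. */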
    private static final short maxParts = 1000;

    private Map<String, String> successfulConversions;
    private Map<String, String> failedConversions;
    private Map<String, String> warnConversions;
    private Map<String, String> dataValidationFailed;
    private Map<String, String> dataValidationSuccessful;

    public ValidationJob(String jobId, Properties props) throws IOException {
        super(jobId, log);

        // Set the conversion config prefix for Avro to ORC
        props.setProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, HIVE_DATASET_CONFIG_AVRO_PREFIX);

        Config config = ConfigFactory.parseProperties(props);
        this.props = props;
        this.metricContext = Instrumented.getMetricContext(ConfigUtils.configToState(config), ValidationJob.class);
        this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE)
                .build();
        this.updateProvider = UpdateProviderFactory.create(props);
        this.datasetFinder = new ConvertibleHiveDatasetFinder(getSourceFs(), props, this.eventSubmitter);
        this.fs = FileSystem.get(new Configuration());

        int maxLookBackDays = Integer.parseInt(props.getProperty(HiveSource.HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY,
                DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS));
        int skipRecentThanDays = Integer.parseInt(props.getProperty(HIVE_SOURCE_SKIP_RECENT_THAN_DAYS_KEY,
                DEFAULT_HIVE_SOURCE_SKIP_RECENT_THAN_DAYS));
        this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();
        this.skipRecentThanTime = new DateTime().minusDays(skipRecentThanDays).getMillis();

        int maxThreadCount = Integer.parseInt(props.getProperty(MAX_THREAD_COUNT, DEFAULT_MAX_THREAD_COUNT));
        this.exec = Executors.newFixedThreadPool(maxThreadCount,
                ExecutorsUtils.newThreadFactory(Optional.of(LoggerFactory.getLogger(ValidationJob.class)),
                        Optional.of("getValidationOutputFromHive")));
        this.futures = Lists.newArrayList();
        EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.VALIDATION_SETUP_EVENT);

        this.pool = HiveMetastoreClientPool.get(props,
                Optional.fromNullable(props.getProperty(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
        Preconditions.checkArgument(props.containsKey(VALIDATION_TYPE_KEY),
                "Missing property " + VALIDATION_TYPE_KEY);
        this.validationType = ValidationType.valueOf(props.getProperty(VALIDATION_TYPE_KEY));
        this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER
                .splitToList(props.getProperty(HIVE_VALIDATION_IGNORE_DATA_PATH_IDENTIFIER_KEY,
                        DEFAULT_HIVE_VALIDATION_IGNORE_DATA_PATH_IDENTIFIER));
        this.throwables = new ArrayList<>();
        this.isNestedORC = Boolean.parseBoolean(props.getProperty(IS_NESTED_ORC, DEFAULT_IS_NESTED_ORC));
        this.hiveSettings = Splitter.on(";").trimResults().omitEmptyStrings()
                .splitToList(props.getProperty(HIVE_SETTINGS, StringUtils.EMPTY));
    }

    @Override
    public void run() throws Exception {
        if (this.validationType == ValidationType.COUNT_VALIDATION) {
            runCountValidation();
        } else if (this.validationType == ValidationType.FILE_FORMAT_VALIDATION) {
            runFileFormatValidation();
        }
    }

    /**
     * Validates that partitions are stored in a given file format.
     * Partitions to be processed are picked up from datasets tagged in the config store.
     * The tag is passed through the key GOBBLIN_CONFIG_TAGS_WHITELIST; datasets imported by that
     * tag are picked up for validation.
     * The path name is treated as the table name and the parent path name as the database name.
     *
     * For example, if the dataset URI picked up is /data/hive/myDb/myTable,
     * then myTable is the table name and myDb is the database name.
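     *
     * A sketch of the properties this mode expects (the format value and tag path are hypothetical;
     * the keys are constants defined in this class):
     *   hive.validation.type=FILE_FORMAT_VALIDATION
     *   hive.validation.fileFormat=ORC
     *   gobblin.config.tags.whitelist=/tags/myValidationTag
     * plus a config store URI under ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI.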
     */
    private void runFileFormatValidation() throws IOException {
        Preconditions.checkArgument(this.props.containsKey(VALIDATION_FILE_FORMAT_KEY));

        this.configStoreUri = StringUtils
                .isNotBlank(this.props.getProperty(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI))
                        ? Optional.of(this.props.getProperty(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI))
                        : Optional.<String>absent();
        if (!Boolean.valueOf(this.props.getProperty(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_ENABLED,
                ConfigurationKeys.DEFAULT_CONFIG_MANAGEMENT_STORE_ENABLED))) {
            this.configStoreUri = Optional.<String>absent();
        }
        List<Partition> partitions = new ArrayList<>();
        if (this.configStoreUri.isPresent()) {
            Preconditions.checkArgument(this.props.containsKey(GOBBLIN_CONFIG_TAGS_WHITELIST),
                    "Missing required property " + GOBBLIN_CONFIG_TAGS_WHITELIST);
            String tag = this.props.getProperty(GOBBLIN_CONFIG_TAGS_WHITELIST);
            ConfigClient configClient = ConfigClient
                    .createConfigClient(VersionStabilityPolicy.WEAK_LOCAL_STABILITY);
            Path tagUri = PathUtils.mergePaths(new Path(this.configStoreUri.get()), new Path(tag));
            try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
                Collection<URI> importedBy = configClient.getImportedBy(new URI(tagUri.toString()), true);
                for (URI uri : importedBy) {
                    String dbName = new Path(uri).getParent().getName();
                    Table table = new Table(client.get().getTable(dbName, new Path(uri).getName()));
                    for (org.apache.hadoop.hive.metastore.api.Partition partition : client.get()
                            .listPartitions(dbName, table.getTableName(), maxParts)) {
                        partitions.add(new Partition(table, partition));
                    }
                }
            } catch (Exception e) {
                this.throwables.add(e);
            }
        }

        for (Partition partition : partitions) {
            if (!shouldValidate(partition)) {
                continue;
            }
            String fileFormat = this.props.getProperty(VALIDATION_FILE_FORMAT_KEY);
            Optional<HiveSerDeWrapper.BuiltInHiveSerDe> hiveSerDe = Enums
                    .getIfPresent(HiveSerDeWrapper.BuiltInHiveSerDe.class, fileFormat.toUpperCase());
            if (!hiveSerDe.isPresent()) {
                throwables.add(new Throwable("Partition SerDe is either not supported or absent"));
                continue;
            }

            String serdeLib = partition.getTPartition().getSd().getSerdeInfo().getSerializationLib();
            if (!hiveSerDe.get().toString().equalsIgnoreCase(serdeLib)) {
                throwables.add(new Throwable("Partition " + partition.getCompleteName() + " SerDe " + serdeLib
                        + " doesn't match with the required SerDe " + hiveSerDe.get().toString()));
            }
        }
        if (!this.throwables.isEmpty()) {
            for (Throwable e : this.throwables) {
                log.error("Failed to validate due to " + e);
            }
            throw new RuntimeException("Validation Job Failed");
        }
    }

    private void runCountValidation() throws InterruptedException {
        try {
            // Validation results
            this.successfulConversions = Maps.newConcurrentMap();
            this.failedConversions = Maps.newConcurrentMap();
            this.warnConversions = Maps.newConcurrentMap();
            this.dataValidationFailed = Maps.newConcurrentMap();
            this.dataValidationSuccessful = Maps.newConcurrentMap();

            // Find datasets to validate
            Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
            EventSubmitter.submit(Optional.of(this.eventSubmitter),
                    EventConstants.VALIDATION_FIND_HIVE_TABLES_EVENT);

            while (iterator.hasNext()) {
                ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
                try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {

                    // Validate dataset
                    log.info(String.format("Validating dataset: %s", hiveDataset));
                    if (HiveUtils.isPartitioned(hiveDataset.getTable())) {
                        processPartitionedTable(hiveDataset, client);
                    } else {
                        processNonPartitionedTable(hiveDataset);
                    }
                }
            }

            // Wait for all validation queries to finish
            log.info(String.format("Waiting for %d futures to complete", this.futures.size()));

            this.exec.shutdown();
            this.exec.awaitTermination(4, TimeUnit.HOURS);

            boolean oneFutureFailure = false;
            // Check if there were any exceptions
            for (Future<Void> future : this.futures) {
                try {
                    future.get();
                } catch (Throwable t) {
                    log.error("getValidationOutputFromHive failed", t);
                    oneFutureFailure = true;
                }
            }

            // Log validation results:
            // Validation results are consolidated into the successful / warn / failed maps,
            // which are then emitted as log lines in the Azkaban logs below.
            for (Map.Entry<String, String> successfulConversion : this.successfulConversions.entrySet()) {
                log.info(String.format("Successful conversion: %s [%s]", successfulConversion.getKey(),
                        successfulConversion.getValue()));
            }
            for (Map.Entry<String, String> warnConversion : this.warnConversions.entrySet()) {
                log.warn(String.format("No conversion found for: %s [%s]", warnConversion.getKey(),
                        warnConversion.getValue()));
            }
            for (Map.Entry<String, String> failedConversion : this.failedConversions.entrySet()) {
                log.error(String.format("Failed conversion: %s [%s]", failedConversion.getKey(),
                        failedConversion.getValue()));
            }

            for (Map.Entry<String, String> success : this.dataValidationSuccessful.entrySet()) {
                log.info(
                        String.format("Data validation successful: %s [%s]", success.getKey(), success.getValue()));
            }

            for (Map.Entry<String, String> failed : this.dataValidationFailed.entrySet()) {
                log.error(String.format("Data validation failed: %s [%s]", failed.getKey(), failed.getValue()));
            }

            if (!this.failedConversions.isEmpty() || !this.dataValidationFailed.isEmpty()) {
                throw new RuntimeException(String.format(
                        "Validation failed for %s conversions. See previous logs for exact validation failures",
                        failedConversions.size()));
            }
            if (oneFutureFailure) {
                throw new RuntimeException("At least one hive ddl failed. Check previous logs");
            }

        } catch (IOException e) {
            Throwables.propagate(e);
        }
    }

    /***
     * Validate a {@link Table} if it was updated recently by checking if its update time
     * lies within the maxLookBackTime and skipRecentThanTime window.
     * @param hiveDataset {@link ConvertibleHiveDataset} containing {@link Table} info.
     * @throws IOException Issue in validating {@link HiveDataset}
     */
    private void processNonPartitionedTable(final ConvertibleHiveDataset hiveDataset) throws IOException {
        try {
            // Validate table
            final long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());

            log.info(String.format("Validating table: %s", hiveDataset.getTable()));

            for (final String format : hiveDataset.getDestFormats()) {
                Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigOptional = hiveDataset
                        .getConversionConfigForFormat(format);
                if (conversionConfigOptional.isPresent()) {
                    ConvertibleHiveDataset.ConversionConfig conversionConfig = conversionConfigOptional.get();
                    String orcTableName = conversionConfig.getDestinationTableName();
                    String orcTableDatabase = conversionConfig.getDestinationDbName();
                    Pair<Optional<org.apache.hadoop.hive.metastore.api.Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils
                            .getDestinationTableMeta(orcTableDatabase, orcTableName, this.props);

                    // Generate validation queries
                    final List<String> validationQueries = HiveValidationQueryGenerator
                            .generateCountValidationQueries(hiveDataset, Optional.<Partition>absent(),
                                    conversionConfig);
                    final List<String> dataValidationQueries = Lists.newArrayList(HiveValidationQueryGenerator
                            .generateDataValidationQuery(hiveDataset.getTable().getTableName(),
                                    hiveDataset.getTable().getDbName(), destinationMeta.getKey().get(),
                                    Optional.<Partition>absent(), this.isNestedORC));

                    this.futures.add(this.exec.submit(new Callable<Void>() {
                        @Override
                        public Void call() throws Exception {

                            // Execute validation queries
                            log.debug(String.format("Going to execute queries: %s for format: %s",
                                    validationQueries, format));
                            List<Long> rowCounts = ValidationJob.this
                                    .getValidationOutputFromHive(validationQueries);
                            log.debug(String.format("Going to execute queries: %s for format: %s",
                                    dataValidationQueries, format));
                            List<Long> rowDataValidatedCount = ValidationJob.this
                                    .getValidationOutputFromHive(dataValidationQueries);
                            // Validate and populate report
                            validateAndPopulateReport(hiveDataset.getTable().getCompleteName(), updateTime,
                                    rowCounts, rowDataValidatedCount);

                            return null;
                        }
                    }));
                } else {
                    log.warn(String.format("No config found for format: %s So skipping table: %s for this format",
                            format, hiveDataset.getTable().getCompleteName()));
                }
            }
        } catch (UncheckedExecutionException e) {
            log.warn(String.format("Not validating table: %s %s", hiveDataset.getTable().getCompleteName(),
                    e.getMessage()));
        } catch (UpdateNotFoundException e) {
            log.warn(String.format("Not validating table: %s as update time was not found. %s",
                    hiveDataset.getTable().getCompleteName(), e.getMessage()));
        }
    }

    /***
     * Validate all {@link Partition}s for a {@link Table} if it was updated recently by checking if its update time
     * lies within the maxLookBackTime and skipRecentThanTime window.
     * @param hiveDataset {@link HiveDataset} containing {@link Table} and {@link Partition} info.
     * @param client {@link IMetaStoreClient} to query Hive.
     * @throws IOException Issue in validating {@link HiveDataset}
     */
    private void processPartitionedTable(ConvertibleHiveDataset hiveDataset,
            AutoReturnableObject<IMetaStoreClient> client) throws IOException {

        // Get partitions for the table
        List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(),
                Optional.<String>absent());

        for (final String format : hiveDataset.getDestFormats()) {
            Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigOptional = hiveDataset
                    .getConversionConfigForFormat(format);

            if (conversionConfigOptional.isPresent()) {

                // Get conversion config
                ConvertibleHiveDataset.ConversionConfig conversionConfig = conversionConfigOptional.get();
                String orcTableName = conversionConfig.getDestinationTableName();
                String orcTableDatabase = conversionConfig.getDestinationDbName();
                Pair<Optional<org.apache.hadoop.hive.metastore.api.Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils
                        .getDestinationTableMeta(orcTableDatabase, orcTableName, this.props);

                // Validate each partition
                for (final Partition sourcePartition : sourcePartitions) {
                    try {
                        final long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
                        if (shouldValidate(sourcePartition)) {
                            log.info(String.format("Validating partition: %s", sourcePartition.getCompleteName()));

                            // Generate validation queries
                            final List<String> countValidationQueries = HiveValidationQueryGenerator
                                    .generateCountValidationQueries(hiveDataset, Optional.of(sourcePartition),
                                            conversionConfig);
                            final List<String> dataValidationQueries = Lists
                                    .newArrayList(HiveValidationQueryGenerator.generateDataValidationQuery(
                                            hiveDataset.getTable().getTableName(),
                                            hiveDataset.getTable().getDbName(), destinationMeta.getKey().get(),
                                            Optional.of(sourcePartition), this.isNestedORC));

                            this.futures.add(this.exec.submit(new Callable<Void>() {
                                @Override
                                public Void call() throws Exception {

                                    // Execute validation queries
                                    log.debug(String.format(
                                            "Going to execute count validation queries queries: %s for format: %s "
                                                    + "and partition %s",
                                            countValidationQueries, format, sourcePartition.getCompleteName()));
                                    List<Long> rowCounts = ValidationJob.this
                                            .getValidationOutputFromHive(countValidationQueries);
                                    log.debug(String.format(
                                            "Going to execute data validation queries: %s for format: %s and partition %s",
                                            dataValidationQueries, format, sourcePartition.getCompleteName()));
                                    List<Long> rowDataValidatedCount = ValidationJob.this
                                            .getValidationOutputFromHive(dataValidationQueries);

                                    // Validate and populate report
                                    validateAndPopulateReport(sourcePartition.getCompleteName(), updateTime,
                                            rowCounts, rowDataValidatedCount);

                                    return null;
                                }
                            }));

                        } else {
                            log.debug(String.format(
                                    "Not validating partition: %s as updateTime: %s is not in range of max look back: %s "
                                            + "and skip recent than: %s",
                                    sourcePartition.getCompleteName(), updateTime, this.maxLookBackTime,
                                    this.skipRecentThanTime));
                        }
                    } catch (UncheckedExecutionException e) {
                        log.warn(String.format("Not validating partition: %s %s", sourcePartition.getCompleteName(),
                                e.getMessage()));
                    } catch (UpdateNotFoundException e) {
                        log.warn(String.format("Not validating partition: %s as update time was not found. %s",
                                sourcePartition.getCompleteName(), e.getMessage()));
                    }
                }
            } else {
                log.info(String.format("No conversion config found for format %s. Ignoring data validation",
                        format));
            }
        }
    }

    /***
     * Execute Hive queries directly over JDBC using {@link HiveJdbcConnector} and read each row count
     * from the first column of the query's {@link ResultSet}. Currently unused in favor of
     * {@link #getValidationOutputFromHive(List)}.
     * @param queries Queries to execute.
     */
    @SuppressWarnings("unused")
    private List<Long> getValidationOutputFromHiveJdbc(List<String> queries) throws IOException {
        if (null == queries || queries.size() == 0) {
            log.warn("No queries specified to be executed");
            return Collections.emptyList();
        }
        Statement statement = null;
        List<Long> rowCounts = Lists.newArrayList();
        Closer closer = Closer.create();

        try {
            HiveJdbcConnector hiveJdbcConnector = closer.register(HiveJdbcConnector.newConnectorWithProps(props));
            statement = hiveJdbcConnector.getConnection().createStatement();

            for (String query : queries) {
                log.info("Executing query: " + query);
                boolean result = statement.execute(query);
                if (result) {
                    ResultSet resultSet = statement.getResultSet();
                    if (resultSet.next()) {
                        rowCounts.add(resultSet.getLong(1));
                    }
                } else {
                    log.warn("Query output for: " + query + " : " + result);
                }
            }

        } catch (SQLException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                closer.close();
            } catch (Exception e) {
                log.warn("Could not close HiveJdbcConnector", e);
            }
            if (null != statement) {
                try {
                    statement.close();
                } catch (SQLException e) {
                    log.warn("Could not close Hive statement", e);
                }
            }
        }

        return rowCounts;
    }

    /***
     * Execute Hive queries using {@link HiveJdbcConnector}, writing each query's output to a temporary
     * HDFS directory via INSERT OVERWRITE DIRECTORY and reading the row count back from the file
     * written there.
     * @param queries Queries to execute.
     */
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SQL_NONCONSTANT_STRING_PASSED_TO_EXECUTE", justification = "Temporary fix")
    private List<Long> getValidationOutputFromHive(List<String> queries) throws IOException {

        if (null == queries || queries.size() == 0) {
            log.warn("No queries specified to be executed");
            return Collections.emptyList();
        }

        List<Long> rowCounts = Lists.newArrayList();
        Closer closer = Closer.create();

        try {
            HiveJdbcConnector hiveJdbcConnector = closer.register(HiveJdbcConnector.newConnectorWithProps(props));
            for (String query : queries) {
                String hiveOutput = "hiveConversionValidationOutput_" + UUID.randomUUID().toString();
                Path hiveTempDir = new Path("/tmp" + Path.SEPARATOR + hiveOutput);
                query = "INSERT OVERWRITE DIRECTORY '" + hiveTempDir + "' " + query;
                log.info("Executing query: " + query);
                try {
                    if (this.hiveSettings.size() > 0) {
                        hiveJdbcConnector
                                .executeStatements(this.hiveSettings.toArray(new String[this.hiveSettings.size()]));
                    }
                    hiveJdbcConnector.executeStatements("SET hive.exec.compress.output=false",
                            "SET hive.auto.convert.join=false", query);
                    FileStatus[] fileStatusList = this.fs.listStatus(hiveTempDir);
                    List<FileStatus> files = new ArrayList<>();
                    for (FileStatus fileStatus : fileStatusList) {
                        if (fileStatus.isFile()) {
                            files.add(fileStatus);
                        }
                    }
                    if (files.size() > 1) {
                        log.warn("Found more than one output file. Should have been one.");
                    } else if (files.size() == 0) {
                        log.warn("Found no output file. Should have been one.");
                    } else {
                        String theString = IOUtils.toString(
                                new InputStreamReader(this.fs.open(files.get(0).getPath()), Charsets.UTF_8));
                        log.info("Found row count: " + theString.trim());
                        if (StringUtils.isBlank(theString.trim())) {
                            rowCounts.add(0L);
                        } else {
                            try {
                                rowCounts.add(Long.parseLong(theString.trim()));
                            } catch (NumberFormatException e) {
                                throw new RuntimeException("Could not parse Hive output: " + theString.trim(), e);
                            }
                        }
                    }
                } finally {
                    if (this.fs.exists(hiveTempDir)) {
                        log.debug("Deleting temp dir: " + hiveTempDir);
                        this.fs.delete(hiveTempDir, true);
                    }
                }
            }
        } catch (SQLException e) {
            log.warn("Execution failed for query set " + queries.toString(), e);
        } finally {
            try {
                closer.close();
            } catch (Exception e) {
                log.warn("Could not close HiveJdbcConnector", e);
            }
        }

        return rowCounts;
    }

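    /***
     * Compare the row counts returned by the count-validation and data-validation queries and record
     * the outcome in the successful / warn / failed report maps, submitting a matching validation event.
     * The first entry of {@code rowCounts} is the source table / partition count (refer
     * HiveValidationQueryGenerator); every subsequent entry is a destination count that must match it.
     */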
    private void validateAndPopulateReport(String datasetIdentifier, long conversionInstance, List<Long> rowCounts,
            List<Long> rowDataValidatedCount) {
        if (null == rowCounts || rowCounts.isEmpty() || null == rowDataValidatedCount
                || rowDataValidatedCount.isEmpty()) {
            this.warnConversions.put(
                    String.format("Dataset: %s Instance: %s", datasetIdentifier, conversionInstance),
                    "No conversion details found");
            this.eventSubmitter.submit(EventConstants.VALIDATION_NOOP_EVENT,
                    ImmutableMap.of("datasetUrn", datasetIdentifier));
            return;
        }
        long rowCountCached = -1;
        boolean isFirst = true;
        for (Long rowCount : rowCounts) {
            // First is always source partition / table (refer HiveValidationQueryGenerator)
            if (isFirst) {
                rowCountCached = rowCount;
                isFirst = false;
                continue;
            }

            // Row count validation
            if (rowCount != rowCountCached) {
                if (rowCount == 0) {
                    this.warnConversions.put(
                            String.format("Dataset: %s Instance: %s", datasetIdentifier, conversionInstance),
                            "Row count found to be 0; the conversion may be delayed.");
                    this.eventSubmitter.submit(EventConstants.VALIDATION_NOOP_EVENT,
                            ImmutableMap.of("datasetUrn", datasetIdentifier));
                } else {
                    this.failedConversions.put(
                            String.format("Dataset: %s Instance: %s", datasetIdentifier, conversionInstance),
                            String.format(
                                    "Row counts did not match across all conversions. Row count expected: %d, Row count got: %d",
                                    rowCountCached, rowCount));
                    this.eventSubmitter.submit(EventConstants.VALIDATION_FAILED_EVENT,
                            ImmutableMap.of("datasetUrn", datasetIdentifier));
                    return;
                }
            } else {
                this.successfulConversions.put(
                        String.format("Dataset: %s Instance: %s", datasetIdentifier, conversionInstance),
                        String.format(
                                "Row counts matched across all conversions. Row count expected: %d, Row count got: %d",
                                rowCountCached, rowCount));
                this.eventSubmitter.submit(EventConstants.VALIDATION_SUCCESSFUL_EVENT,
                        ImmutableMap.of("datasetUrn", datasetIdentifier));
            }
        }

        // Data count validation
        if (rowCountCached == rowDataValidatedCount.get(0)) {
            this.dataValidationSuccessful.put(
                    String.format("Dataset: %s Instance: %s", datasetIdentifier, conversionInstance),
                    "Common rows matched expected value. Expected: " + rowCountCached + " Found: "
                            + rowDataValidatedCount);
        } else {
            this.dataValidationFailed.put(
                    String.format("Dataset: %s Instance: %s", datasetIdentifier, conversionInstance),
                    "Common rows did not match expected value. Expected: " + rowCountCached + " Found: "
                            + rowDataValidatedCount);
        }
    }

    /***
     * Get source {@link FileSystem}
     * @return Source {@link FileSystem}
     * @throws IOException Issue in fetching {@link FileSystem}
     */
    private static FileSystem getSourceFs() throws IOException {
        return FileSystem.get(HadoopUtils.newConfiguration());
    }

    /**
     * Determine if the {@link Partition} should be validated by checking that its data location does
     * not contain any ignored path token and that its create time lies within the maxLookBackTime to
     * skipRecentThanTime window.
     */
    private boolean shouldValidate(Partition partition) {
        for (String pathToken : this.ignoreDataPathIdentifierList) {
            if (partition.getDataLocation().toString().toLowerCase().contains(pathToken.toLowerCase())) {
                log.info("Skipping partition " + partition.getCompleteName() + " containing invalid token "
                        + pathToken.toLowerCase());
                return false;
            }
        }

        try {
            long createTime = getPartitionCreateTime(partition.getName());
            boolean withinTimeWindow = new DateTime(createTime).isAfter(this.maxLookBackTime)
                    && new DateTime(createTime).isBefore(this.skipRecentThanTime);
            if (!withinTimeWindow) {
                log.info("Skipping partition " + partition.getCompleteName() + " as create time "
                        + new DateTime(createTime).toString() + " is not within validation time window ");
            } else {
                log.info("Validating partition " + partition.getCompleteName());
                return withinTimeWindow;
            }
        } catch (ParseException e) {
            Throwables.propagate(e);
        }
        return false;
    }

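    /***
     * Derive a partition's create time from the "datepartition" token in its name.
     * For example, a partition named "datepartition=2016-01-01-00" parses (with format yyyy-MM-dd-HH)
     * to the epoch millis of 2016-01-01 00:00 in the JVM's default time zone.
     * @throws ParseException If the date token cannot be parsed.
     */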
    public static Long getPartitionCreateTime(String partitionName) throws ParseException {
        String dateString = null;
        for (String st : SLASH_SPLITTER.splitToList(partitionName)) {
            if (st.startsWith(DATEPARTITION)) {
                dateString = EQUALITY_SPLITTER.splitToList(st).get(1);
            }
        }
        Preconditions.checkNotNull(dateString, "Unable to get partition date");
        DateFormat dateFormat = new SimpleDateFormat(DATE_FORMAT);
        return dateFormat.parse(dateString).getTime();
    }
}

enum ValidationType {
    COUNT_VALIDATION, FILE_FORMAT_VALIDATION
}