com.thinkbiganalytics.spark.dataprofiler.core.Profiler.java Source code

Introduction

Here is the source code for com.thinkbiganalytics.spark.dataprofiler.core.Profiler.java. The class parses command line arguments, builds (or accepts) a Hive query, runs it through Spark's SQLContext, profiles the resulting data set, and writes the statistics to an output table.

Source

package com.thinkbiganalytics.spark.dataprofiler.core;

/*-
 * #%L
 * thinkbig-spark-job-profiler-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.hive.util.HiveUtils;
import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;
import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration;
import com.thinkbiganalytics.spark.dataprofiler.StatisticsModel;
import com.thinkbiganalytics.spark.dataprofiler.output.OutputWriter;
import com.thinkbiganalytics.spark.policy.FieldPolicyLoader;

import org.apache.commons.lang.StringUtils;
import org.apache.spark.sql.SQLContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
 * Generates data profile statistics for a table or query, and writes the result to a table.
 */
public class Profiler {

    private static final Logger log = LoggerFactory.getLogger(Profiler.class);

    private final FieldPolicyLoader loader;

    private final com.thinkbiganalytics.spark.dataprofiler.Profiler profiler;

    private final ProfilerConfiguration profilerConfiguration;

    private final SparkContextService sparkContextService;

    private final SQLContext sqlContext;

    /**
     * Main entry point into the program.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) {
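        // Bootstrap a Spring context over the com.thinkbiganalytics.spark package
        // and resolve the profiler's collaborators from it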
        final ApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark");
        final Profiler profiler = new Profiler(ctx.getBean(FieldPolicyLoader.class),
                ctx.getBean(com.thinkbiganalytics.spark.dataprofiler.Profiler.class),
                ctx.getBean(ProfilerConfiguration.class), ctx.getBean(SparkContextService.class),
                ctx.getBean(SQLContext.class));
        profiler.run(args);
    }

    public Profiler(FieldPolicyLoader loader, com.thinkbiganalytics.spark.dataprofiler.Profiler profiler,
            ProfilerConfiguration profilerConfiguration, SparkContextService sparkContextService,
            SQLContext sqlContext) {
        this.loader = loader;
        this.profiler = profiler;
        this.profilerConfiguration = profilerConfiguration;
        this.sparkContextService = sparkContextService;
        this.sqlContext = sqlContext;
    }

    public void run(String[] args) {
        /* Check command line arguments and get query to run. */
        final String queryString = checkCommandLineArgs(args);
        if (queryString == null) {
            return;
        }

        /* Run query and get result */
        log.info("[PROFILER-INFO] Analyzing profile statistics for: [{}]", queryString);
        final DataSet resultDF = sparkContextService.sql(sqlContext, queryString);

        /* Get profile statistics and write to table */
        final StatisticsModel statisticsModel = profiler.profile(resultDF, profilerConfiguration);

        if (statisticsModel != null) {
            OutputWriter.writeModel(statisticsModel, profilerConfiguration, sqlContext, sparkContextService);
        } else {
            log.info("[PROFILER-INFO] No data to process. Hence, no profile statistics generated.");
        }

        /* Wrap up */
        log.info("[PROFILER-INFO] Profiling finished.");
    }

    /**
     * Check command line arguments
     *
     * @param args list of command line arguments
     * @return query to run (null if invalid arguments)
     */
    @Nullable
    private String checkCommandLineArgs(final String[] args) {
        if (log.isInfoEnabled()) {
            log.info("Running Spark Profiler with the following command line {} args (comma separated): {}",
                    args.length, StringUtils.join(args, ","));
        }

        if (args.length < 5) {
            log.error("Invalid number of command line arguments ({})", args.length);
            showCommandLineArgs();
            return null;
        }

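        // Positional arguments:
        //   args[0] object type ("table" or "query"), args[1] table name or query,
        //   args[2] n for top_n values, args[3] output table,
        //   args[4] field policy JSON path, args[5] optional partition key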
        String profileObjectType = args[0];
        String profileObjectDesc = args[1];
        int n;
        try {
            // Reject a non-numeric top-n argument instead of propagating NumberFormatException
            n = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            log.error("Illegal command line argument for n for top_n values ({})", args[2]);
            showCommandLineArgs();
            return null;
        }
        String profileOutputTable = args[3];
        String fieldPolicyJsonPath = args[4];
        Map<String, FieldPolicy> policyMap = loader.loadFieldPolicy(fieldPolicyJsonPath);

        String retVal;

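        // "ALL" profiles the whole table; a sixth argument restricts the query to one partition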
        String inputAndOutputTablePartitionKey = "ALL";

        if (args.length >= 6) {
            inputAndOutputTablePartitionKey = args[5];
        }

        switch (profileObjectType) {
        case "table":
            // Quote source table
            final String[] tableRef = profileObjectDesc.split("\\.", 2);
            final String safeTable = tableRef.length == 1 ? HiveUtils.quoteIdentifier(tableRef[0])
                    : HiveUtils.quoteIdentifier(tableRef[0], tableRef[1]);

            // Create SQL
            List<String> profiledColumns = new ArrayList<>();
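            // Only columns whose field policy enables profiling are selected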
            for (FieldPolicy fieldPolicy : policyMap.values()) {
                if (fieldPolicy.isProfile()) {
                    profiledColumns.add(HiveUtils.quoteIdentifier(fieldPolicy.getField().toLowerCase()));
                }
            }

            if (!profiledColumns.isEmpty()) {
                retVal = "select " + StringUtils.join(profiledColumns, ',') + " from " + safeTable;
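                // Restrict the scan to a single partition unless the key is "ALL"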
                if (inputAndOutputTablePartitionKey != null
                        && !"ALL".equalsIgnoreCase(inputAndOutputTablePartitionKey)) {
                    retVal += " where "
                            + HiveUtils.quoteIdentifier(profilerConfiguration.getInputTablePartitionColumnName())
                            + " = " + HiveUtils.quoteString(inputAndOutputTablePartitionKey);
                }
            } else {
                retVal = null;
            }
            break;
        case "query":
            retVal = profileObjectDesc;
            break;
        default:
            log.error("Illegal command line argument for object type ({})", profileObjectType);
            showCommandLineArgs();
            return null;
        }

        if (n <= 0) {
            log.error("Illegal command line argument for n for top_n values ({})", n);
            showCommandLineArgs();
            return null;
        } else {
            profilerConfiguration.setNumberOfTopNValues(n);
        }

        if (!setOutputTableDBAndName(profileOutputTable, profilerConfiguration)) {
            log.error("Illegal command line argument for output table ({})", profileOutputTable);
            showCommandLineArgs();
            return null;
        }

        profilerConfiguration.setInputAndOutputTablePartitionKey(inputAndOutputTablePartitionKey);

        return retVal;
    }

    /**
     * Parses the output table argument into database and table names.
     *
     * @param profileOutputTable    output table, optionally qualified as database.table
     * @param profilerConfiguration configuration to receive the parsed names
     * @return true if the argument is valid, false otherwise
     */
    private boolean setOutputTableDBAndName(@Nonnull final String profileOutputTable,
            @Nonnull final ProfilerConfiguration profilerConfiguration) {

        boolean retVal = true;
        String[] tableNameParts = profileOutputTable.split("\\.");

        if (tableNameParts.length == 1) {
            // output db remains as 'default'
            profilerConfiguration.setOutputTableName(tableNameParts[0]);
        } else if (tableNameParts.length == 2) {
            profilerConfiguration.setOutputDbName(tableNameParts[0]);
            profilerConfiguration.setOutputTableName(tableNameParts[1]);
        } else {
            retVal = false;
        }

        return retVal;
    }

    /**
     * Show required command-line arguments.
     */
    private void showCommandLineArgs() {
        log.info("*** \nInfo: Required command line arguments:\n"
                + "1. object type: valid values are {table, query}\n"
                + "2. object description: valid values are {<database.table>, <query>}\n"
                + "3. n for top_n values: valid value is {<integer>}\n"
                + "4. output table: valid values are {<table>, <database.table>}\n"
                + "5. full path to policy file\n"
                + "\nInfo: Optional command line argument:\n"
                + "6. partition_key: valid value is {<string>}\n\n"
                + "(Note: Only alphanumeric and underscore characters for table names and partition key)"
                + "\n***");
    }
}
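
Example

For illustration, here is how the profiler's entry point could be invoked with arguments in the order validated by checkCommandLineArgs. This is a minimal sketch: the table names, policy path, and partition key are hypothetical, and it assumes a runtime (for example, spark-submit) that provides the SQLContext and the com.thinkbiganalytics.spark beans that main() wires up.

public class ProfilerLauncherExample {

    public static void main(String[] args) {
        // All values below are illustrative placeholders, not real tables or paths
        Profiler.main(new String[] {
            "table",                  // 1. object type: "table" or "query"
            "mydb.customers",         // 2. object description: database.table
            "10",                     // 3. n for top_n values
            "mydb.profile_results",   // 4. output table
            "/tmp/field-policy.json", // 5. full path to the field policy JSON file
            "20170101"                // 6. optional partition key
        });
    }
}

Passing "query" as the first argument makes the second argument a query string that is profiled as-is, skipping the generated SELECT over the field-policy columns.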