cascading.tap.hive.HiveTableDescriptor.java Source code

Introduction

Here is the source code for cascading.tap.hive.HiveTableDescriptor.java; a short usage sketch follows the listing.

Source

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.tap.hive;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import cascading.CascadingException;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.partition.Partition;
import cascading.tuple.Fields;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;

/**
 * HiveTableDescriptor encapsulates information about a Hive table, such as the table name, column names, column
 * types, and partitioning. The class can convert this information to Hive specific objects or Cascading specific
 * objects, acting as a translator between the concepts of a Hive table and the concepts of a Cascading Tap/Scheme.
 */
public class HiveTableDescriptor implements Serializable {

    /** default DB in Hive. */
    public final static String HIVE_DEFAULT_DATABASE_NAME = MetaStoreUtils.DEFAULT_DATABASE_NAME;

    /** default field delimiter in Hive tables: Ctrl-A ("\001") */
    public static final String HIVE_DEFAULT_DELIMITER = "\1";

    /** default input format used by Hive */
    public static final String HIVE_DEFAULT_INPUT_FORMAT_NAME = "org.apache.hadoop.mapred.TextInputFormat";

    /** default output format used by Hive */
    public static final String HIVE_DEFAULT_OUTPUT_FORMAT_NAME = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";

    /** default serialization lib name */
    public static final String HIVE_DEFAULT_SERIALIZATION_LIB_NAME = HiveConf.ConfVars.HIVESCRIPTSERDE.defaultVal;

    /** columns to be used for partitioning */
    private String[] partitionKeys;

    /** field delimiter in the Hive table */
    private String delimiter;

    /** name of the hive table */
    private String tableName;

    /** name of the database */
    private String databaseName;

    /** names of the columns */
    private String[] columnNames;

    /** hive column types */
    private String[] columnTypes;

    /** Hive serialization library */
    private String serializationLib;

    /** Optional alternate location of the table */
    private String location = null;

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param tableName   The table name.
     * @param columnNames Names of the columns.
     * @param columnTypes Hive types of the columns.
     */
    public HiveTableDescriptor(String tableName, String[] columnNames, String[] columnTypes) {
        this(HIVE_DEFAULT_DATABASE_NAME, tableName, columnNames, columnTypes, new String[] {},
                HIVE_DEFAULT_DELIMITER, HIVE_DEFAULT_SERIALIZATION_LIB_NAME, null);
    }

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param tableName   The table name.
     * @param columnNames Names of the columns.
     * @param columnTypes Hive types of the columns.
     * @param partitionKeys The keys for partitioning the table.
     */
    public HiveTableDescriptor(String tableName, String[] columnNames, String[] columnTypes,
            String[] partitionKeys) {
        this(HIVE_DEFAULT_DATABASE_NAME, tableName, columnNames, columnTypes, partitionKeys, HIVE_DEFAULT_DELIMITER,
                HIVE_DEFAULT_SERIALIZATION_LIB_NAME, null);
    }

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param tableName   The table name.
     * @param columnNames Names of the columns.
     * @param columnTypes Hive types of the columns.
     * @param partitionKeys The keys for partitioning the table.
     * @param delimiter   The field delimiter of the Hive table.
     *
     */
    public HiveTableDescriptor(String tableName, String[] columnNames, String[] columnTypes, String[] partitionKeys,
            String delimiter) {
        this(HIVE_DEFAULT_DATABASE_NAME, tableName, columnNames, columnTypes, partitionKeys, delimiter,
                HIVE_DEFAULT_SERIALIZATION_LIB_NAME, null);
    }

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param databaseName The database name.
     * @param tableName   The table name.
     * @param columnNames Names of the columns.
     * @param columnTypes Hive types of the columns.
     */
    public HiveTableDescriptor(String databaseName, String tableName, String[] columnNames, String[] columnTypes) {
        this(databaseName, tableName, columnNames, columnTypes, new String[] {}, HIVE_DEFAULT_DELIMITER,
                HIVE_DEFAULT_SERIALIZATION_LIB_NAME, null);
    }

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param databaseName The database name.
     * @param tableName   The table name.
     * @param columnNames Names of the columns.
     * @param columnTypes Hive types of the columns.
     * @param partitionKeys The keys for partitioning the table.
     */
    public HiveTableDescriptor(String databaseName, String tableName, String[] columnNames, String[] columnTypes,
            String[] partitionKeys) {
        this(databaseName, tableName, columnNames, columnTypes, partitionKeys, HIVE_DEFAULT_DELIMITER,
                HIVE_DEFAULT_SERIALIZATION_LIB_NAME, null);
    }

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param databaseName     The database name.
     * @param tableName   The table name.
     * @param columnNames Names of the columns.
     * @param columnTypes Hive types of the columns.
     * @param partitionKeys The keys for partitioning the table.
     * @param delimiter   The field delimiter of the Hive table.
     *
     */
    public HiveTableDescriptor(String databaseName, String tableName, String[] columnNames, String[] columnTypes,
            String[] partitionKeys, String delimiter) {
        this(databaseName, tableName, columnNames, columnTypes, partitionKeys, delimiter,
                HIVE_DEFAULT_SERIALIZATION_LIB_NAME, null);
    }

    /**
     * Constructs a new HiveTableDescriptor object.
     *
     * @param databaseName     The database name.
     * @param tableName        The table name.
     * @param columnNames      Names of the columns.
     * @param columnTypes      Hive types of the columns.
     * @param partitionKeys    The keys for partitioning the table.
     * @param delimiter        The field delimiter of the Hive table.
     * @param serializationLib Hive serialization library.
     * @param location         Optional alternate location of the table.
     */
    public HiveTableDescriptor(String databaseName, String tableName, String[] columnNames, String[] columnTypes,
            String[] partitionKeys, String delimiter, String serializationLib, Path location) {
        if (tableName == null || tableName.isEmpty())
            throw new IllegalArgumentException("tableName cannot be null or empty");
        if (databaseName == null || databaseName.isEmpty())
            this.databaseName = HIVE_DEFAULT_DATABASE_NAME;
        else
            this.databaseName = databaseName.toLowerCase();
        this.tableName = tableName.toLowerCase();
        this.columnNames = columnNames;
        this.columnTypes = columnTypes;
        this.partitionKeys = partitionKeys;
        this.serializationLib = serializationLib;
        // Only default the delimiter when the default (delimited text) SerDe is used.
        if (delimiter == null && HIVE_DEFAULT_SERIALIZATION_LIB_NAME.equals(this.serializationLib))
            this.delimiter = HIVE_DEFAULT_DELIMITER;
        else
            this.delimiter = delimiter;
        if (columnNames.length == 0 || columnTypes.length == 0 || columnNames.length != columnTypes.length)
            throw new IllegalArgumentException(
                    "columnNames and columnTypes cannot be empty and must have the same size");
        if (isPartitioned())
            verifyPartitionKeys();

        if (location != null) {
            if (!location.isAbsolute())
                throw new IllegalArgumentException("location must be a fully qualified absolute path");

            // Store as a String, since Path is not Serializable
            this.location = location.toString();
        }
    }

    /**
     * Private method to verify that all partition keys are also listed as column keys.
     */
    private void verifyPartitionKeys() {
        for (int index = 0; index < partitionKeys.length; index++) {
            String key = partitionKeys[index];
            if (!caseInsensitiveContains(columnNames, key))
                throw new IllegalArgumentException(
                        String.format("Given partition key '%s' not present in column names", key));
        }
    }

    /**
     * Converts the instance to a Hive Table object, which can be used with the MetaStore API.
     *
     * @return a new Table instance.
     */
    public Table toHiveTable() {
        Table table = new Table();
        table.setDbName(getDatabaseName());
        table.setTableName(tableName);
        table.setTableType(TableType.MANAGED_TABLE.toString());

        StorageDescriptor sd = new StorageDescriptor();
        for (int index = 0; index < columnNames.length; index++) {
            String columnName = columnNames[index];
            if (!caseInsensitiveContains(partitionKeys, columnName))
                // calling toLowerCase() on the type to match the behaviour of the hive console
                sd.addToCols(new FieldSchema(columnName, columnTypes[index].toLowerCase(), "created by Cascading"));
        }
        SerDeInfo serDeInfo = new SerDeInfo();
        serDeInfo.setSerializationLib(serializationLib);
        Map<String, String> serDeParameters = new HashMap<String, String>();

        if (getDelimiter() != null) {
            serDeParameters.put("serialization.format", getDelimiter());
            serDeParameters.put("field.delim", getDelimiter());
        } else {
            serDeParameters.put("serialization.format", "1");
        }
        serDeInfo.setParameters(serDeParameters);

        sd.setSerdeInfo(serDeInfo);
        sd.setInputFormat(HIVE_DEFAULT_INPUT_FORMAT_NAME);
        sd.setOutputFormat(HIVE_DEFAULT_OUTPUT_FORMAT_NAME);

        if (location != null) {
            table.setTableType(TableType.EXTERNAL_TABLE.toString());
            // The metastore also requires the EXTERNAL parameter, in addition to the table type
            table.putToParameters("EXTERNAL", "TRUE");
            sd.setLocation(location);
        }

        table.setSd(sd);

        if (isPartitioned()) {
            table.setPartitionKeys(createPartitionSchema());
            table.setPartitionKeysIsSet(true);
        }

        return table;
    }

    /**
     * Creates a List of FieldSchema instances representing the partition columns of the Hive table.
     * @return a List of FieldSchema instances.
     */
    private List<FieldSchema> createPartitionSchema() {
        List<FieldSchema> schema = new LinkedList<FieldSchema>();
        for (String partitionKey : partitionKeys) {
            // match case-insensitively, consistent with the partition key checks elsewhere in this class
            for (int index = 0; index < columnNames.length; index++) {
                if (columnNames[index].equalsIgnoreCase(partitionKey)) {
                    schema.add(new FieldSchema(columnNames[index], columnTypes[index], ""));
                    break;
                }
            }
        }
        return schema;
    }

    /**
     * Returns a new Partition object to be used with a HivePartitionTap. If the table is not partitioned the method
     * throws a CascadingException.
     * @return a new Partition object.
     */
    public Partition getPartition() {
        if (isPartitioned())
            return new HivePartition(new Fields(getPartitionKeys()));
        throw new CascadingException("non partitioned table cannot be used in a partitioned context");
    }

    /**
     * Converts the HiveTableDescriptor to a Fields instance. If the table is partitioned only the columns not
     * part of the partitioning will be returned.
     * @return A Fields instance.
     */
    public Fields toFields() {
        if (!isPartitioned())
            return new Fields(columnNames);

        // remove the partition columns, matching case-insensitively as elsewhere in this class
        List<Comparable> names = new ArrayList<Comparable>();
        for (String columnName : columnNames)
            if (!caseInsensitiveContains(partitionKeys, columnName))
                names.add(columnName);

        return new Fields(names.toArray(new Comparable[names.size()]));
    }

    /**
     * Returns the location of the table: the explicit location, if one was set, otherwise the path of the table
     * within the given warehouse directory.
     * @param warehousePath The path of the Hive warehouse directory.
     * @return The location of the table.
     */
    public String getLocation(String warehousePath) {
        if (location != null)
            return location;
        else if (getDatabaseName().equals(HIVE_DEFAULT_DATABASE_NAME))
            return String.format("%s/%s", warehousePath, getTableName());
        else
            return String.format("%s/%s.db/%s", warehousePath, getDatabaseName(), getTableName());
    }

    /**
     * Converts the HiveTableDescriptor to a Scheme instance based on the information available.
     *
     * @return a new Scheme instance.
     */
    public Scheme toScheme() {
        // TODO add smarts to return the right thing.
        Scheme scheme = new TextDelimited(false, getDelimiter());
        scheme.setSinkFields(toFields());
        return scheme;
    }

    public String[] getColumnNames() {
        return columnNames;
    }

    public String[] getColumnTypes() {
        return columnTypes;
    }

    public String getTableName() {
        return tableName;
    }

    public String getDatabaseName() {
        return databaseName;
    }

    public String getDelimiter() {
        return delimiter;
    }

    public String[] getPartitionKeys() {
        return partitionKeys;
    }

    public boolean isPartitioned() {
        return partitionKeys != null && partitionKeys.length > 0;
    }

    @Override
    public boolean equals(Object object) {
        if (this == object)
            return true;
        if (object == null || getClass() != object.getClass())
            return false;

        HiveTableDescriptor that = (HiveTableDescriptor) object;

        if (!arraysEqualCaseInsensitive(columnNames, that.columnNames))
            return false;
        if (!arraysEqualCaseInsensitive(columnTypes, that.columnTypes))
            return false;
        if (databaseName != null ? !databaseName.equalsIgnoreCase(that.databaseName) : that.databaseName != null)
            return false;
        if (delimiter != null ? !delimiter.equals(that.delimiter) : that.delimiter != null)
            return false;
        if (!arraysEqualCaseInsensitive(partitionKeys, that.partitionKeys))
            return false;
        if (serializationLib != null ? !serializationLib.equals(that.serializationLib)
                : that.serializationLib != null)
            return false;
        if (tableName != null ? !tableName.equalsIgnoreCase(that.tableName) : that.tableName != null)
            return false;
        if (location != null ? !location.equals(that.location) : that.location != null)
            return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = partitionKeys != null ? arraysHashCodeCaseInsensitive(partitionKeys) : 0;
        result = 31 * result + (delimiter != null ? delimiter.hashCode() : 0);
        result = 31 * result + (tableName != null ? tableName.toLowerCase().hashCode() : 0);
        result = 31 * result + (databaseName != null ? databaseName.toLowerCase().hashCode() : 0);
        result = 31 * result + (columnNames != null ? arraysHashCodeCaseInsensitive(columnNames) : 0);
        result = 31 * result + (columnTypes != null ? arraysHashCodeCaseInsensitive(columnTypes) : 0);
        result = 31 * result + (serializationLib != null ? serializationLib.hashCode() : 0);
        result = 31 * result + (location != null ? location.hashCode() : 0);
        return result;
    }

    @Override
    public String toString() {
        return "HiveTableDescriptor{" + "partitionKeys=" + Arrays.toString(partitionKeys) + ", delimiter='"
                + delimiter + '\'' + ", tableName='" + tableName + '\'' + ", databaseName='" + databaseName + '\''
                + ", columnNames=" + Arrays.toString(columnNames) + ", columnTypes=" + Arrays.toString(columnTypes)
                + ", serializationLib='" + serializationLib + '\''
                + (location != null ? ", location='" + location + '\'' : "") + '}';
    }

    private static boolean arraysEqualCaseInsensitive(String[] left, String[] right) {
        if (left == right)
            return true;
        if (left == null || right == null || left.length != right.length)
            return false;

        for (int index = 0; index < left.length; index++)
            if (!left[index].equalsIgnoreCase(right[index]))
                return false;

        return true;
    }

    private static boolean caseInsensitiveContains(String[] data, String key) {
        boolean found = false;
        for (int i = 0; i < data.length && !found; i++) {
            if (data[i].equalsIgnoreCase(key))
                found = true;
        }
        return found;
    }

    private static int arraysHashCodeCaseInsensitive(String[] strings) {
        String[] lower = new String[strings.length];
        for (int index = 0; index < strings.length; index++)
            lower[index] = strings[index].toLowerCase();

        return Arrays.hashCode(lower);
    }
}
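
Usage

The listing on this page does not come with a demo, so below is a minimal, hypothetical usage sketch. It assumes the class above is compiled and on the classpath together with the Cascading and Hive metastore dependencies; the table name, column names, and warehouse path are made up for illustration.

import java.util.Arrays;

import cascading.tap.hive.HiveTableDescriptor;
import cascading.tuple.Fields;

import org.apache.hadoop.hive.metastore.api.Table;

public class HiveTableDescriptorDemo {

    public static void main(String[] args) {
        // describe a table with two data columns, partitioned by the 'day' column
        HiveTableDescriptor descriptor = new HiveTableDescriptor(
                "logs",                                        // table name (hypothetical)
                new String[] { "key", "value", "day" },        // column names
                new String[] { "string", "string", "string" }, // Hive column types
                new String[] { "day" });                       // partition keys

        // metastore view: partition columns are kept out of the StorageDescriptor
        Table table = descriptor.toHiveTable();
        System.out.println(table.getDbName() + "." + table.getTableName()); // default.logs

        // Cascading view: toFields() also excludes the partition columns
        Fields fields = descriptor.toFields();
        System.out.println(fields); // ['key', 'value']

        // partition keys, and the location under a hypothetical warehouse directory
        System.out.println(Arrays.toString(descriptor.getPartitionKeys())); // [day]
        System.out.println(descriptor.getLocation("/user/hive/warehouse")); // /user/hive/warehouse/logs
    }
}

Since the descriptor defaults to the 'default' database and the default delimited SerDe, toHiveTable() here produces a MANAGED_TABLE whose StorageDescriptor uses the Ctrl-A delimiter; passing a Path as the last argument of the full constructor would mark the table EXTERNAL instead.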