cascading.tap.hive.HiveTap.java Source code

Java tutorial

Introduction

Here is the source code for cascading.tap.hive.HiveTap.java

Source

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.tap.hive;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Properties;

import cascading.CascadingException;
import cascading.property.AppProps;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.TapException;
import cascading.tap.hadoop.Hfs;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.metastore.HiveMetaHookLoader;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.RetryingMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.AlreadyExistsException;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.mapred.JobConf;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * HiveTap is a Tap implementation that can create Hive tables on HDFS. HiveTap supports a strict mode, which
 * verifies that an existing table has the same structure as the one requested by the user. This behaviour is off by
 * default and can be enabled by passing strict=true to the constructor.
 */
public class HiveTap extends Hfs {
    /** Field LOG */
    private static final Logger LOG = LoggerFactory.getLogger(HiveTap.class);

    static {
        // add cascading-jdbc release to frameworks
        Properties properties = new Properties();
        InputStream stream = HiveTap.class.getClassLoader().getResourceAsStream("cascading/framework.properties");
        if (stream != null) {
            try {
                properties.load(stream);
                stream.close();
            } catch (IOException exception) {
                // ingore
            }
        }
        String framework = properties.getProperty("name");
        AppProps.addApplicationFramework(null, framework);
    }

    /** TableDescriptor for the table. */
    private final HiveTableDescriptor tableDescriptor;

    /** HiveConf object */
    private transient HiveConf hiveConf;

    /** strict mode enforces that an existing table has to match the given TableDescriptor */
    private boolean strict;

    /** last modified time */
    private long modifiedTime;

    /**
     * Constructs a new HiveTap instance.
     *
     * @param tableDesc The HiveTableDescriptor for creating and validating Hive tables.
     * @param scheme    The Scheme to be used by the Tap.
     */
    public HiveTap(HiveTableDescriptor tableDesc, Scheme scheme) {
        this(tableDesc, scheme, SinkMode.KEEP, false);
    }

    /**
     * Constructs a new HiveTap instance.
     *
     * @param tableDesc The HiveTableDescriptor for creating and validating Hive tables.
     * @param scheme    The Scheme to be used by the Tap.
     * @param mode      The SinkMode to use
     * @param strict    Enables and disables strict validation of hive tables.
     */
    public HiveTap(HiveTableDescriptor tableDesc, Scheme scheme, SinkMode mode, boolean strict) {
        super(scheme, null, mode);
        this.tableDescriptor = tableDesc;
        this.strict = strict;
        setScheme(scheme);
        setFilesystemLocation();
    }

    @Override
    public boolean createResource(JobConf conf) throws IOException {
        if (!resourceExists(conf))
            return createHiveTable();
        return true;
    }

    /**
     * Private method to create Hive table in the MetaStore.
     *
     * @return true, if the table has been created successfully.
     * @throws IOException In case an interaction with the Hive metastore fails.
     */
    private boolean createHiveTable() throws IOException {
        IMetaStoreClient metaStoreClient = null;
        try {
            metaStoreClient = createMetaStoreClient();
            Table hiveTable = tableDescriptor.toHiveTable();
            try {
                metaStoreClient.getDatabase(tableDescriptor.getDatabaseName());
            }
            // there is no databaseExists method in hive 0.10, so we have to use exceptions for flow control.
            catch (NoSuchObjectException exception) {
                LOG.info("creating database '{}' at '{}' ", tableDescriptor.getDatabaseName(),
                        getPath().getParent().toString());
                Database db = new Database(tableDescriptor.getDatabaseName(), "created by Cascading",
                        getPath().getParent().toString(), null);

                try {
                    metaStoreClient.createDatabase(db);
                } catch (MetaException ex) {
                    // Ignore exceptions caused by the database already existing.
                    // This is caused by two hive taps running in parallel and writing to the same database.
                    if (!databaseConcurrentlyCreated(ex)) {
                        throw ex;
                    }
                }
            }
            LOG.info("creating table '{}' at '{}' ", tableDescriptor.getTableName(), getPath().toString());

            metaStoreClient.createTable(hiveTable);
            modifiedTime = System.currentTimeMillis();
            return true;
        } catch (MetaException exception) {
            throw new IOException(exception);
        } catch (TException exception) {
            throw new IOException(exception);
        } finally {
            if (metaStoreClient != null)
                metaStoreClient.close();
        }
    }

    /**
     * Check if the cause of the exception is that the same database has already been created.
     *
     * @param exception the MetaException that has been caught trying to create the database.
     * @returns true if the exception indicates that the same database already exists.
     */
    private boolean databaseConcurrentlyCreated(MetaException exception) {
        return exception.getCause() != null && exception.getCause().getCause() != null
                && exception.getCause().getCause().getMessage().startsWith(
                        "The statement was aborted because it would have caused a duplicate key value in a unique or primary key constraint or unique index identified by 'UNIQUE_DATABASE'");
    }

    @Override
    public boolean resourceExists(JobConf conf) throws IOException {
        IMetaStoreClient metaStoreClient = null;
        try {
            metaStoreClient = createMetaStoreClient();
            Table table = metaStoreClient.getTable(tableDescriptor.getDatabaseName(),
                    tableDescriptor.getTableName());
            modifiedTime = table.getLastAccessTime();
            // check if the schema matches the table descriptor. If not, throw an exception.
            if (strict) {
                LOG.info("strict mode: comparing existing hive table with table descriptor");
                if (!table.getTableType().equals(tableDescriptor.toHiveTable().getTableType()))
                    throw new HiveTableValidationException(
                            String.format("expected a table of type '%s' but found '%s'",
                                    tableDescriptor.toHiveTable().getTableType(), table.getTableType()));

                // Check that the paths are the same
                FileSystem fs = FileSystem.get(conf);
                StorageDescriptor sd = table.getSd();
                Path expectedPath = fs.makeQualified(
                        new Path(tableDescriptor.getLocation(hiveConf.getVar(ConfVars.METASTOREWAREHOUSE))));
                Path actualPath = fs.makeQualified(new Path(sd.getLocation()));

                if (!expectedPath.equals(actualPath))
                    throw new HiveTableValidationException(
                            String.format("table in MetaStore does not have the sampe path. Expected %s got %s",
                                    expectedPath, actualPath));

                List<FieldSchema> schemaList = sd.getCols();
                if (schemaList.size() != tableDescriptor.getColumnNames().length
                        - tableDescriptor.getPartitionKeys().length)
                    throw new HiveTableValidationException(String.format(
                            "table in MetaStore does not have same number of columns. expected %d got %d",
                            tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length,
                            schemaList.size()));
                for (int index = 0; index < schemaList.size(); index++) {
                    FieldSchema schema = schemaList.get(index);
                    String expectedColumnName = tableDescriptor.getColumnNames()[index];
                    String expectedColumnType = tableDescriptor.getColumnTypes()[index];
                    // this could be extended to the StorageDescriptor if necessary.
                    if (!schema.getName().equalsIgnoreCase(expectedColumnName))
                        throw new HiveTableValidationException(
                                String.format("hive schema mismatch: expected column name '%s', but found '%s'",
                                        expectedColumnName, schema.getName()));
                    if (!schema.getType().equalsIgnoreCase(expectedColumnType))
                        throw new HiveTableValidationException(
                                String.format("hive schema mismatch: expected column type '%s', but found '%s'",
                                        expectedColumnType, schema.getType()));
                }
                List<FieldSchema> schemaPartitions = table.getPartitionKeys();
                if (schemaPartitions.size() != tableDescriptor.getPartitionKeys().length)
                    throw new HiveTableValidationException(String.format(
                            "table in MetaStore does not have same number of partition columns. expected %d got %d",
                            tableDescriptor.getPartitionKeys().length, schemaPartitions.size()));
                int offset = tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length;
                for (int index = 0; index < schemaPartitions.size(); index++) {
                    FieldSchema schema = schemaPartitions.get(index);
                    String expectedColumnName = tableDescriptor.getColumnNames()[index + offset];
                    String expectedColumnType = tableDescriptor.getColumnTypes()[index + offset];
                    // this could be extended to the StorageDescriptor if necessary.
                    if (!schema.getName().equalsIgnoreCase(expectedColumnName))
                        throw new HiveTableValidationException(String.format(
                                "hive partition schema mismatch: expected column name '%s', but found '%s'",
                                expectedColumnName, schema.getName()));
                    if (!schema.getType().equalsIgnoreCase(expectedColumnType))
                        throw new HiveTableValidationException(String.format(
                                "hive partition schema mismatch: expected column type '%s', but found '%s'",
                                expectedColumnType, schema.getType()));
                }
            }
            return true;
        } catch (MetaException exception) {
            throw new IOException(exception);
        } catch (NoSuchObjectException exception) {
            return false;
        } catch (TException exception) {
            throw new IOException(exception);
        } finally {
            if (metaStoreClient != null)
                metaStoreClient.close();
        }
    }

    @Override
    public boolean deleteResource(JobConf conf) throws IOException {
        // clean up HDFS
        super.deleteResource(conf);

        IMetaStoreClient metaStoreClient = null;
        try {
            LOG.info("dropping hive table {} in database {}", tableDescriptor.getTableName(),
                    tableDescriptor.getDatabaseName());
            metaStoreClient = createMetaStoreClient();
            metaStoreClient.dropTable(tableDescriptor.getDatabaseName(), tableDescriptor.getTableName(), true,
                    true);
        } catch (MetaException exception) {
            throw new IOException(exception);
        } catch (NoSuchObjectException exception) {
            throw new IOException(exception);
        } catch (TException exception) {
            throw new IOException(exception);
        } finally {
            if (metaStoreClient != null)
                metaStoreClient.close();
        }
        return true;
    }

    /**
     * Registers a new Partition of a HiveTable. If the Partition already exists, it is ignored. If the current
     * table is not partitioned, the call is also ignored.
     *
     * @param conf      JobConf object of the current flow.
     * @param partition The partition to register.
     * @throws IOException In case any interaction with the HiveMetaStore fails.
     */
    void registerPartition(JobConf conf, Partition partition) throws IOException {
        if (!tableDescriptor.isPartitioned())
            return;

        if (!resourceExists(conf))
            createHiveTable();

        IMetaStoreClient metaStoreClient = null;
        try {
            metaStoreClient = createMetaStoreClient();
            metaStoreClient.add_partition(partition);
        } catch (MetaException exception) {
            throw new IOException(exception);
        } catch (InvalidObjectException exception) {
            throw new IOException(exception);
        } catch (AlreadyExistsException exception) {
            // ignore
        } catch (TException exception) {
            throw new IOException(exception);
        } finally {
            if (metaStoreClient != null)
                metaStoreClient.close();
        }
    }

    @Override
    public boolean commitResource(JobConf conf) throws IOException {
        boolean result = true;
        try {
            if (!resourceExists(conf))
                result = createHiveTable();
        } catch (IOException exception) {
            throw new TapException(exception);
        }
        return super.commitResource(conf) && result;
    }

    @Override
    public long getModifiedTime(JobConf conf) throws IOException {
        return modifiedTime;
    }

    /**
     * Internal method to get access to the HiveTableDescriptor of the HiveTap.
     *
     * @return The HiveTableDescriptor.
     */
    HiveTableDescriptor getTableDescriptor() {
        return tableDescriptor;
    }

    /**
     * Private method that sets the correct location of the files on HDFS. For an existing table
     * it uses the value from the Hive MetaStore. Otherwise it uses the default location for Hive.
     *
     * */
    private void setFilesystemLocation() {
        // If the table already exists get the location otherwise use the location from the table descriptor.
        IMetaStoreClient metaStoreClient = null;
        try {
            metaStoreClient = createMetaStoreClient();
            Table table = metaStoreClient.getTable(tableDescriptor.getDatabaseName(),
                    tableDescriptor.getTableName());
            String path = table.getSd().getLocation();
            setStringPath(path);
        } catch (MetaException exception) {
            throw new CascadingException(exception);
        } catch (NoSuchObjectException exception) {
            setStringPath(tableDescriptor.getLocation(hiveConf.getVar(ConfVars.METASTOREWAREHOUSE)));
        } catch (TException exception) {
            throw new CascadingException(exception);
        } finally {
            if (metaStoreClient != null)
                metaStoreClient.close();
        }
    }

    /**
     * Private helper method to create a IMetaStore client.
     *
     * @return a new IMetaStoreClient
     * @throws MetaException in case the creation fails.
     */
    private IMetaStoreClient createMetaStoreClient() throws MetaException {
        // it is a bit unclear if it is safe to re-use these instances, so we create a
        // new one every time, to be sure
        if (hiveConf == null)
            hiveConf = new HiveConf();

        return RetryingMetaStoreClient.getProxy(hiveConf, new HiveMetaHookLoader() {
            @Override
            public HiveMetaHook getHook(Table tbl) throws MetaException {
                return null;
            }
        }, HiveMetaStoreClient.class.getName());
    }
}