Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hive.streaming; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.HiveMetaStoreUtils; import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.metastore.api.TxnToWriteId; import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.plan.AddPartitionDesc; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hive.common.util.ShutdownHookManager; import org.apache.thrift.TException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.annotations.VisibleForTesting; /** * Streaming connection implementation for hive. To create a streaming connection, use the builder API * to create record writer first followed by the connection itself. Once connection is created, clients can * begin a transaction, keep writing using the connection, commit the transaction and close connection when done. * To bind to the correct metastore, HiveConf object has to be created from hive-site.xml or HIVE_CONF_DIR. * If hive conf is manually created, metastore uri has to be set correctly. If hive conf object is not specified, * "thrift://localhost:9083" will be used as default. * <br/><br/> * NOTE: The streaming connection APIs and record writer APIs are not thread-safe. Streaming connection creation, * begin/commit/abort transactions, write and close has to be called in the same thread. If close() or * abortTransaction() has to be triggered from a separate thread it has to be co-ordinated via external variables or * synchronization mechanism * <br/><br/> * Example usage: * <pre>{@code * // create delimited record writer whose schema exactly matches table schema * StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder() * .withFieldDelimiter(',') * .build(); * // create and open streaming connection (default.src table has to exist already) * StreamingConnection connection = HiveStreamingConnection.newBuilder() * .withDatabase("default") * .withTable("src") * .withAgentInfo("nifi-agent") * .withRecordWriter(writer) * .withHiveConf(hiveConf) * .connect(); * // begin a transaction, write records and commit 1st transaction * connection.beginTransaction(); * connection.write("key1,val1".getBytes()); * connection.write("key2,val2".getBytes()); * connection.commitTransaction(); * // begin another transaction, write more records and commit 2nd transaction * connection.beginTransaction(); * connection.write("key3,val3".getBytes()); * connection.write("key4,val4".getBytes()); * connection.commitTransaction(); * // close the streaming connection * connection.close(); * } * </pre> */ public class HiveStreamingConnection implements StreamingConnection { private static final Logger LOG = LoggerFactory.getLogger(HiveStreamingConnection.class.getName()); private static final String DEFAULT_METASTORE_URI = "thrift://localhost:9083"; private static final int DEFAULT_TRANSACTION_BATCH_SIZE = 1; private static final boolean DEFAULT_STREAMING_OPTIMIZATIONS_ENABLED = true; public enum TxnState { INACTIVE("I"), OPEN("O"), COMMITTED("C"), ABORTED("A"), PREPARED_FOR_COMMIT("P"); private final String code; TxnState(String code) { this.code = code; } public String toString() { return code; } } // fields populated from builder private String database; private String table; private List<String> staticPartitionValues; private String agentInfo; private int transactionBatchSize; private RecordWriter recordWriter; private StreamingTransaction currentTransactionBatch; private HiveConf conf; private boolean streamingOptimizations; private AtomicBoolean isConnectionClosed = new AtomicBoolean(false); // internal fields private boolean isPartitionedTable; private IMetaStoreClient msClient; private IMetaStoreClient heartbeatMSClient; private final String username; private final boolean secureMode; private Table tableObject = null; private String metastoreUri; private ConnectionStats connectionStats; private final Long writeId; private final Integer statementId; private boolean manageTransactions; private int countTransactions = 0; private Set<String> partitions; private Long tableId; private Runnable onShutdownRunner; private HiveStreamingConnection(Builder builder) throws StreamingException { this.database = builder.database.toLowerCase(); this.table = builder.table.toLowerCase(); this.staticPartitionValues = builder.staticPartitionValues; this.conf = builder.hiveConf; this.agentInfo = builder.agentInfo; this.streamingOptimizations = builder.streamingOptimizations; this.writeId = builder.writeId; this.statementId = builder.statementId; this.tableObject = builder.tableObject; this.setPartitionedTable(builder.isPartitioned); this.manageTransactions = builder.manageTransactions; UserGroupInformation loggedInUser = null; try { loggedInUser = UserGroupInformation.getLoginUser(); } catch (IOException e) { LOG.warn("Unable to get logged in user via UGI. err: {}", e.getMessage()); } if (loggedInUser == null) { this.username = System.getProperty("user.name"); this.secureMode = false; } else { this.username = loggedInUser.getShortUserName(); this.secureMode = loggedInUser.hasKerberosCredentials(); } this.transactionBatchSize = builder.transactionBatchSize; this.recordWriter = builder.recordWriter; this.connectionStats = new ConnectionStats(); if (agentInfo == null) { try { agentInfo = username + ":" + InetAddress.getLocalHost().getHostName() + ":" + Thread.currentThread().getName(); } catch (UnknownHostException e) { // ignore and use UUID instead this.agentInfo = UUID.randomUUID().toString(); } } if (conf == null) { conf = createHiveConf(this.getClass(), DEFAULT_METASTORE_URI); } overrideConfSettings(conf); if (manageTransactions) { this.metastoreUri = conf.get(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName()); this.msClient = getMetaStoreClient(conf, metastoreUri, secureMode, "streaming-connection"); // We use a separate metastore client for heartbeat calls to ensure heartbeat RPC calls are // isolated from the other transaction related RPC calls. this.heartbeatMSClient = getMetaStoreClient(conf, metastoreUri, secureMode, "streaming-connection-heartbeat"); validateTable(); } LOG.info("STREAMING CONNECTION INFO: {}", toConnectionInfoString()); } public static Builder newBuilder() { return new Builder(); } public static class Builder { private String database; private String table; private List<String> staticPartitionValues; private String agentInfo; private HiveConf hiveConf; private int transactionBatchSize = DEFAULT_TRANSACTION_BATCH_SIZE; private boolean streamingOptimizations = DEFAULT_STREAMING_OPTIMIZATIONS_ENABLED; private RecordWriter recordWriter; private long writeId = -1; private int statementId = -1; private boolean manageTransactions = true; private Table tableObject; private boolean isPartitioned; /** * Specify database to use for streaming connection. * * @param database - db name * @return - builder */ public Builder withDatabase(final String database) { this.database = database; return this; } /** * Specify table to use for streaming connection. * * @param table - table name * @return - builder */ public Builder withTable(final String table) { this.table = table; return this; } /** * Specify the name of partition to use for streaming connection. * * @param staticPartitionValues - static partition values * @return - builder */ public Builder withStaticPartitionValues(final List<String> staticPartitionValues) { this.staticPartitionValues = staticPartitionValues == null ? null : new ArrayList<>(staticPartitionValues); return this; } /** * Specify agent info to use for streaming connection. * * @param agentInfo - agent info * @return - builder */ public Builder withAgentInfo(final String agentInfo) { this.agentInfo = agentInfo; return this; } /** * Specify hive configuration object to use for streaming connection. * Generate this object by point to already existing hive-site.xml or HIVE_CONF_DIR. * Make sure if metastore URI has been set correctly else thrift://localhost:9083 will be * used as default. * * @param hiveConf - hive conf object * @return - builder */ public Builder withHiveConf(final HiveConf hiveConf) { this.hiveConf = hiveConf; return this; } /** * Transaction batch size to use (default value is 10). This is expert level configuration. * For every transaction batch a delta directory will be created which will impact * when compaction will trigger. * NOTE: This is evolving API and is subject to change/might not be honored in future releases. * * @param transactionBatchSize - transaction batch size * @return - builder */ @InterfaceStability.Evolving public Builder withTransactionBatchSize(final int transactionBatchSize) { this.transactionBatchSize = transactionBatchSize; return this; } /** * Whether to enable streaming optimizations. This is expert level configurations. * Disabling streaming optimizations will have significant impact to performance and memory consumption. * * @param enable - flag to enable or not * @return - builder */ public Builder withStreamingOptimizations(final boolean enable) { this.streamingOptimizations = enable; return this; } /** * Record writer to use for writing records to destination table. * * @param recordWriter - record writer * @return - builder */ public Builder withRecordWriter(final RecordWriter recordWriter) { this.recordWriter = recordWriter; return this; } /** * Specify this parameter if we want the current connection * to join an ongoing transaction without having to query * the metastore to create it. * @param writeId write id * @return builder */ public Builder withWriteId(final long writeId) { this.writeId = writeId; manageTransactions = false; return this; } /** * Specify this parameter to set an statement id in the writer. * This really only makes sense to be specified when a writeId is * provided as well * @param statementId statement id * @return builder */ public Builder withStatementId(final int statementId) { this.statementId = statementId; return this; } /** * Specify the table object since sometimes no connections * to the metastore will be opened. * @param table table object. * @return builder */ public Builder withTableObject(Table table) { this.tableObject = table; this.isPartitioned = tableObject.getPartitionKeys() != null && !tableObject.getPartitionKeys().isEmpty(); return this; } /** * Returning a streaming connection to hive. * * @return - hive streaming connection */ public HiveStreamingConnection connect() throws StreamingException { if (database == null) { throw new StreamingException("Database cannot be null for streaming connection"); } if (table == null) { if (tableObject == null) { throw new StreamingException( "Table and table object cannot be " + "null for streaming connection"); } else { table = tableObject.getTableName(); } } if (tableObject != null && !tableObject.getTableName().equals(table)) { throw new StreamingException("Table must match tableObject table name"); } if (recordWriter == null) { throw new StreamingException("Record writer cannot be null for streaming connection"); } if ((writeId != -1 && tableObject == null) || (writeId == -1 && tableObject != null)) { throw new StreamingException( "If writeId is set, tableObject " + "must be set as well and vice versa"); } HiveStreamingConnection streamingConnection = new HiveStreamingConnection(this); streamingConnection.onShutdownRunner = streamingConnection::close; // assigning higher priority than FileSystem shutdown hook so that streaming connection gets closed first before // filesystem close (to avoid ClosedChannelException) ShutdownHookManager.addShutdownHook(streamingConnection.onShutdownRunner, FileSystem.SHUTDOWN_HOOK_PRIORITY + 1); Thread.setDefaultUncaughtExceptionHandler((t, e) -> streamingConnection.close()); return streamingConnection; } } private void setPartitionedTable(Boolean isPartitionedTable) { this.isPartitionedTable = isPartitionedTable; } @Override public String toString() { return "{ metaStoreUri: " + metastoreUri + ", database: " + database + ", table: " + table + " }"; } private String toConnectionInfoString() { return "{ metastore-uri: " + metastoreUri + ", " + "database: " + database + ", " + "table: " + table + ", " + "partitioned-table: " + isPartitionedTable() + ", " + "dynamic-partitioning: " + isDynamicPartitioning() + ", " + "username: " + username + ", " + "secure-mode: " + secureMode + ", " + "record-writer: " + recordWriter.getClass().getSimpleName() + ", " + "agent-info: " + agentInfo + ", " + "writeId: " + writeId + ", " + "statementId: " + statementId + " }"; } @VisibleForTesting String toTransactionString() { return currentTransactionBatch == null ? "" : currentTransactionBatch.toString(); } @Override public PartitionInfo createPartitionIfNotExists(final List<String> partitionValues) throws StreamingException { String partLocation = null; String partName = null; boolean exists = false; try { Map<String, String> partSpec = Warehouse.makeSpecFromValues(tableObject.getPartitionKeys(), partitionValues); AddPartitionDesc addPartitionDesc = new AddPartitionDesc(database, table, true); partName = Warehouse.makePartName(tableObject.getPartitionKeys(), partitionValues); partLocation = new Path(tableObject.getDataLocation(), Warehouse.makePartPath(partSpec)).toString(); addPartitionDesc.addPartition(partSpec, partLocation); Partition partition = Hive.convertAddSpecToMetaPartition(tableObject, addPartitionDesc.getPartition(0), conf); if (getMSC() == null) { // We assume it doesn't exist if we can't check it // so the driver will decide return new PartitionInfo(partName, partLocation, false); } getMSC().add_partition(partition); if (LOG.isDebugEnabled()) { LOG.debug("Created partition {} for table {}", partName, tableObject.getFullyQualifiedName()); } } catch (AlreadyExistsException e) { exists = true; } catch (HiveException | TException e) { throw new StreamingException("Unable to creation partition for values: " + partitionValues + " connection: " + toConnectionInfoString(), e); } return new PartitionInfo(partName, partLocation, exists); } /** * Returns the file that would be used to store rows under this. * parameters * @param partitionValues partition values * @param bucketId bucket id * @param minWriteId min write Id * @param maxWriteId max write Id * @param statementId statement Id * @return the location of the file. * @throws StreamingException when the path is not found */ @Override public Path getDeltaFileLocation(List<String> partitionValues, Integer bucketId, Long minWriteId, Long maxWriteId, Integer statementId) throws StreamingException { return recordWriter.getDeltaFileLocation(partitionValues, bucketId, minWriteId, maxWriteId, statementId, tableObject); } IMetaStoreClient getMSC() { connectionStats.incrementMetastoreCalls(); return msClient; } IMetaStoreClient getHeatbeatMSC() { connectionStats.incrementMetastoreCalls(); return heartbeatMSClient; } private void validateTable() throws InvalidTable, ConnectionError { try { tableObject = new Table(getMSC().getTable(database, table)); } catch (Exception e) { LOG.warn("Unable to validate the table for connection: " + toConnectionInfoString(), e); throw new InvalidTable(database, table, e); } // 1 - check that the table is Acid if (!AcidUtils.isFullAcidTable(tableObject)) { LOG.error("HiveEndPoint " + this + " must use an acid table"); throw new InvalidTable(database, table, "is not an Acid table"); } if (tableObject.getPartitionKeys() != null && !tableObject.getPartitionKeys().isEmpty()) { setPartitionedTable(true); } else { setPartitionedTable(false); } // partition values are specified on non-partitioned table if (!isPartitionedTable() && (staticPartitionValues != null && !staticPartitionValues.isEmpty())) { // Invalid if table is not partitioned, but endPoint's partitionVals is not empty String errMsg = this.toString() + " specifies partitions for un-partitioned table"; LOG.error(errMsg); throw new ConnectionError(errMsg); } } private void beginNextTransaction() throws StreamingException { if (currentTransactionBatch == null) { currentTransactionBatch = createNewTransactionBatch(); LOG.info("Opened new transaction batch {}", currentTransactionBatch); } if (currentTransactionBatch.isClosed()) { throw new StreamingException("Cannot begin next transaction on a closed streaming connection"); } if (currentTransactionBatch.remainingTransactions() == 0) { LOG.info("Transaction batch {} is done. Rolling over to next transaction batch.", currentTransactionBatch); currentTransactionBatch.close(); currentTransactionBatch = createNewTransactionBatch(); LOG.info("Rolled over to new transaction batch {}", currentTransactionBatch); } currentTransactionBatch.beginNextTransaction(); } private StreamingTransaction createNewTransactionBatch() throws StreamingException { countTransactions++; if (manageTransactions) { return new TransactionBatch(this); } else { if (countTransactions > 1) { throw new StreamingException("If a writeId is passed for the " + "construction of HiveStreaming only one transaction batch" + " can be done"); } return new UnManagedSingleTransaction(this); } } private void checkClosedState() throws StreamingException { if (isConnectionClosed.get()) { throw new StreamingException("Streaming connection is closed already."); } } private void checkState() throws StreamingException { checkClosedState(); if (currentTransactionBatch == null) { throw new StreamingException("Transaction batch is null. Missing beginTransaction?"); } if (currentTransactionBatch.getCurrentTransactionState() != TxnState.OPEN) { throw new StreamingException("Transaction state is not OPEN. Missing beginTransaction?"); } } @Override public void beginTransaction() throws StreamingException { checkClosedState(); partitions = new HashSet<>(); beginNextTransaction(); } @Override public void commitTransaction() throws StreamingException { commitTransaction(null); } @Override public void commitTransaction(Set<String> partitions) throws StreamingException { commitTransaction(partitions, null, null); } @Override public void commitTransaction(Set<String> partitions, String key, String value) throws StreamingException { checkState(); Set<String> createdPartitions = new HashSet<>(); if (partitions != null) { for (String partition : partitions) { try { PartitionInfo info = createPartitionIfNotExists(Warehouse.getPartValuesFromPartName(partition)); if (!info.isExists()) { createdPartitions.add(partition); } } catch (MetaException e) { throw new StreamingException("Partition " + partition + " is invalid.", e); } } connectionStats.incrementTotalPartitions(partitions.size()); } currentTransactionBatch.commit(createdPartitions, key, value); this.partitions.addAll(currentTransactionBatch.getPartitions()); connectionStats.incrementCreatedPartitions(createdPartitions.size()); connectionStats.incrementCommittedTransactions(); } @Override public void abortTransaction() throws StreamingException { checkState(); currentTransactionBatch.abort(); connectionStats.incrementAbortedTransactions(); } @Override public void write(final byte[] record) throws StreamingException { checkState(); currentTransactionBatch.write(record); } @Override public void write(final InputStream inputStream) throws StreamingException { checkState(); currentTransactionBatch.write(inputStream); } /** * Close connection */ @Override public void close() { if (isConnectionClosed.get()) { return; } isConnectionClosed.set(true); try { if (currentTransactionBatch != null) { currentTransactionBatch.close(); } } catch (StreamingException e) { LOG.warn("Unable to close current transaction batch: " + currentTransactionBatch, e); } finally { if (manageTransactions) { getMSC().close(); getHeatbeatMSC().close(); } //remove shutdown hook entry added while creating this connection via HiveStreamingConnection.Builder#connect() if (!ShutdownHookManager.isShutdownInProgress()) { ShutdownHookManager.removeShutdownHook(this.onShutdownRunner); } } if (LOG.isInfoEnabled()) { LOG.info("Closed streaming connection. Agent: {} Stats: {}", getAgentInfo(), getConnectionStats()); } } @Override public ConnectionStats getConnectionStats() { return connectionStats; } private static IMetaStoreClient getMetaStoreClient(HiveConf conf, String metastoreUri, boolean secureMode, String owner) throws ConnectionError { if (metastoreUri != null) { conf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), metastoreUri); } if (secureMode) { conf.setBoolean(MetastoreConf.ConfVars.USE_THRIFT_SASL.getHiveName(), true); } try { LOG.info("Creating metastore client for {}", owner); return HiveMetaStoreUtils.getHiveMetastoreClient(conf); } catch (MetaException | IOException e) { throw new ConnectionError( "Error connecting to Hive Metastore URI: " + metastoreUri + ". " + e.getMessage(), e); } } @VisibleForTesting TxnState getCurrentTransactionState() { return currentTransactionBatch.getCurrentTransactionState(); } @VisibleForTesting int remainingTransactions() { return currentTransactionBatch.remainingTransactions(); } @VisibleForTesting Long getCurrentTxnId() { return currentTransactionBatch.getCurrentTxnId(); } private HiveConf createHiveConf(Class<?> clazz, String metaStoreUri) { HiveConf conf = new HiveConf(clazz); if (metaStoreUri != null) { conf.set(MetastoreConf.ConfVars.THRIFT_URIS.getHiveName(), metaStoreUri); } return conf; } private void overrideConfSettings(HiveConf conf) { setHiveConf(conf, HiveConf.ConfVars.HIVE_TXN_MANAGER, DbTxnManager.class.getName()); setHiveConf(conf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, true); setHiveConf(conf, MetastoreConf.ConfVars.EXECUTE_SET_UGI.getHiveName()); setHiveConf(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict"); if (streamingOptimizations) { setHiveConf(conf, HiveConf.ConfVars.HIVE_ORC_DELTA_STREAMING_OPTIMIZATIONS_ENABLED, true); } // since same thread creates metastore client for streaming connection thread and heartbeat thread we explicitly // disable metastore client cache setHiveConf(conf, HiveConf.ConfVars.METASTORE_CLIENT_CACHE_ENABLED, false); } private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, String value) { if (LOG.isDebugEnabled()) { LOG.debug("Overriding HiveConf setting : " + var + " = " + value); } conf.setVar(var, value); } private static void setHiveConf(HiveConf conf, HiveConf.ConfVars var, boolean value) { if (LOG.isDebugEnabled()) { LOG.debug("Overriding HiveConf setting : " + var + " = " + value); } conf.setBoolVar(var, value); } private static void setHiveConf(HiveConf conf, String var) { if (LOG.isDebugEnabled()) { LOG.debug("Overriding HiveConf setting : " + var + " = " + true); } conf.setBoolean(var, true); } public List<TxnToWriteId> getTxnToWriteIds() { if (currentTransactionBatch != null) { return currentTransactionBatch.getTxnToWriteIds(); } return null; } @Override public HiveConf getHiveConf() { return conf; } @Override public String getMetastoreUri() { return metastoreUri; } @Override public Table getTable() { return tableObject; } @Override public List<String> getStaticPartitionValues() { return staticPartitionValues; } @Override public String getAgentInfo() { return agentInfo; } @Override public boolean isPartitionedTable() { return isPartitionedTable; } @Override public boolean isDynamicPartitioning() { return isPartitionedTable() && (staticPartitionValues == null || staticPartitionValues.isEmpty()); } @Override public Set<String> getPartitions() { return partitions; } public String getUsername() { return username; } public String getDatabase() { return database; } public RecordWriter getRecordWriter() { return recordWriter; } public int getTransactionBatchSize() { return transactionBatchSize; } public HiveConf getConf() { return conf; } public Long getWriteId() { return writeId; } public Integer getStatementId() { return statementId; } public Long getCurrentWriteId() { return currentTransactionBatch.getCurrentWriteId(); } }