// Java tutorial
/* * Copyright (C) 2015-2019 Uber Technologies, Inc. (streaming-data@uber.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.uber.stream.kafka.mirrormaker.manager.core; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.codahale.metrics.Counter; import com.uber.stream.kafka.mirrormaker.common.configuration.IuReplicatorConf; import com.uber.stream.kafka.mirrormaker.common.core.IHelixManager; import com.uber.stream.kafka.mirrormaker.common.core.InstanceTopicPartitionHolder; import com.uber.stream.kafka.mirrormaker.common.core.TopicPartition; import com.uber.stream.kafka.mirrormaker.common.core.TopicWorkload; import com.uber.stream.kafka.mirrormaker.common.modules.ControllerWorkloadInfo; import com.uber.stream.kafka.mirrormaker.common.utils.HelixSetupUtils; import com.uber.stream.kafka.mirrormaker.common.utils.HelixUtils; import com.uber.stream.kafka.mirrormaker.common.utils.HttpClientUtils; import com.uber.stream.kafka.mirrormaker.manager.ManagerConf; import com.uber.stream.kafka.mirrormaker.manager.reporter.HelixKafkaMirrorMakerMetricsReporter; import com.uber.stream.kafka.mirrormaker.manager.validation.SourceKafkaClusterValidationManager; import java.io.IOException; import java.net.URISyntaxException; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantLock; import kafka.utils.ZKStringSerializer$; import org.I0Itec.zkclient.ZkClient; 
import org.apache.commons.lang.StringUtils; import org.apache.helix.*; import org.apache.helix.model.ExternalView; import org.apache.helix.model.HelixConfigScope.ConfigScopeProperty; import org.apache.helix.model.IdealState; import org.apache.helix.model.builder.HelixConfigScopeBuilder; import org.apache.helix.store.zk.ZkHelixPropertyStore; import org.apache.http.client.config.RequestConfig; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Main logic for Helix Manager-Controller * * @author hongxu */ public class ControllerHelixManager implements IHelixManager { private static final Logger LOGGER = LoggerFactory.getLogger(ControllerHelixManager.class); private static final String MANAGER_CONTROLLER_HELIX_PREFIX = "manager-controller"; private static final String CONFIG_KAFKA_CLUSTER_KEY_PREFIX = "kafka.cluster.zkStr."; private static final String SEPARATOR = "@"; private final ManagerConf _conf; private final SourceKafkaClusterValidationManager _srcKafkaValidationManager; private final String _helixZkURL; private final String _helixClusterName; private HelixManager _helixManager; private ZkHelixPropertyStore<ZNRecord> _helixPropertyStore; private HelixAdmin _helixAdmin; private String _instanceId; private final WorkerHelixManager _workerHelixManager; private LiveInstanceChangeListener _liveInstanceChangeListener; private Map<String, TopicWorkload> _pipelineWorkloadMap; private final CloseableHttpClient _httpClient; private final int _controllerPort; private final RequestConfig _requestConfig; private final int _workloadRefreshPeriodInSeconds; private final int _initMaxNumPartitionsPerRoute; private final int _maxNumPartitionsPerRoute; private final int _initMaxNumWorkersPerRoute; private final int _maxNumWorkersPerRoute; private Map<String, Map<String, Counter>> _routeToCounterMap; 
private static final String TOPIC_TOTAL_NUMBER = "topicTotalNumber"; private static final String TOPIC_ERROR_NUMBER = "topicErrorNumber"; private static final String CONTROLLER_TOTAL_NUMBER = "controllerTotalNumber"; private static final String CONTROLLER_ERROR_NUMBER = "controllerErrorNumber"; private static final String WORKER_TOTAL_NUMBER = "workerTotalNumber"; private static final String WORKER_ERROR_NUMBER = "workerErrorNumber"; private static final String WORKER_NUMBER_OVERRIDE = "worker_number_override"; private static final String PIPELINE_PATH = "/pipeline"; private static final Counter _availableController = new Counter(); private static final Counter _availableWorker = new Counter(); private static final Counter _nonParityTopic = new Counter(); private static final Counter _validateWrongCount = new Counter(); private static final Counter _rescaleFailedCount = new Counter(); private static final Counter _lowUrgencyValidateWrongCount = new Counter(); private static final Counter _assignedControllerCount = new Counter(); private ReentrantLock _lock = new ReentrantLock(); private Map<String, Map<String, InstanceTopicPartitionHolder>> _topicToPipelineInstanceMap; private Map<String, PriorityQueue<InstanceTopicPartitionHolder>> _pipelineToInstanceMap; private List<String> _availableControllerList; private long lastUpdateTimeMs = 0L; private ZkClient _zkClient; private boolean _enableAutoScaling = true; private boolean _enableRebalance; public ControllerHelixManager(SourceKafkaClusterValidationManager srcKafkaValidationManager, ManagerConf managerConf) { _conf = managerConf; _enableRebalance = managerConf.getEnableRebalance(); _srcKafkaValidationManager = srcKafkaValidationManager; _initMaxNumPartitionsPerRoute = managerConf.getInitMaxNumPartitionsPerRoute(); _maxNumPartitionsPerRoute = managerConf.getMaxNumPartitionsPerRoute(); _initMaxNumWorkersPerRoute = managerConf.getInitMaxNumWorkersPerRoute(); _maxNumWorkersPerRoute = 
managerConf.getMaxNumWorkersPerRoute(); _workloadRefreshPeriodInSeconds = managerConf.getWorkloadRefreshPeriodInSeconds(); _workerHelixManager = new WorkerHelixManager(managerConf); _pipelineWorkloadMap = new ConcurrentHashMap<>(); _helixZkURL = HelixUtils.getAbsoluteZkPathForHelix(managerConf.getManagerZkStr()); _helixClusterName = MANAGER_CONTROLLER_HELIX_PREFIX + "-" + managerConf.getManagerDeployment(); _instanceId = managerConf.getManagerInstanceId(); _topicToPipelineInstanceMap = new ConcurrentHashMap<>(); _pipelineToInstanceMap = new ConcurrentHashMap<>(); _availableControllerList = new ArrayList<>(); _routeToCounterMap = new ConcurrentHashMap<>(); _zkClient = new ZkClient(_helixZkURL, 30000, 30000, ZKStringSerializer$.MODULE$); registerMetrics(); PoolingHttpClientConnectionManager limitedConnMgr = new PoolingHttpClientConnectionManager(); // TODO: make it configurable limitedConnMgr.setDefaultMaxPerRoute(100); limitedConnMgr.setMaxTotal(100); _httpClient = HttpClients.createMinimal(limitedConnMgr); _controllerPort = managerConf.getControllerPort(); // requestConfig is immutable. These three timeouts are for // 1. getting connection from connection manager; // 2. establishing connection with server; // 3. getting next data snippet from server. 
_requestConfig = RequestConfig.custom().setConnectionRequestTimeout(30000).setConnectTimeout(30000) .setSocketTimeout(30000).build(); } public synchronized void start() { LOGGER.info("Trying to start ManagerControllerHelix!"); _workerHelixManager.start(); _helixManager = HelixSetupUtils.setup(_helixClusterName, _helixZkURL, _instanceId); _helixAdmin = _helixManager.getClusterManagmentTool(); _helixPropertyStore = _helixManager.getHelixPropertyStore(); updateCurrentStatus(); LOGGER.info("Trying to register ControllerLiveInstanceChangeListener"); _liveInstanceChangeListener = new ControllerLiveInstanceChangeListener(this, _helixManager, _workloadRefreshPeriodInSeconds); try { _helixManager.addLiveInstanceChangeListener(_liveInstanceChangeListener); } catch (Exception e) { LOGGER.error("Failed to add ControllerLiveInstanceChangeListener"); } } public synchronized void stop() throws IOException { LOGGER.info("Trying to stop ManagerControllerHelix!"); _zkClient.close(); _workerHelixManager.stop(); _helixManager.disconnect(); _httpClient.close(); } private void registerMetrics() { try { HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("controller.available.counter", _availableController); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("worker.available.counter", _availableWorker); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("topic.non-parity.counter", _nonParityTopic); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("validate.wrong.counter", _validateWrongCount); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("rescale.failed.counter", _rescaleFailedCount); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("validate.wrong.low.urgency.counter", _lowUrgencyValidateWrongCount); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric("controller.assigned.counter", _assignedControllerCount); } catch (Exception e) { LOGGER.error("Error registering metrics!", e); } } private void maybeRegisterMetrics(String route) 
{ if (!_routeToCounterMap.containsKey(route)) { _routeToCounterMap.putIfAbsent(route, new ConcurrentHashMap<>()); _routeToCounterMap.get(route).put(TOPIC_TOTAL_NUMBER, new Counter()); //_routeToCounterMap.get(routeString).put(TOPIC_ERROR_NUMBER, new Counter()); _routeToCounterMap.get(route).put(CONTROLLER_TOTAL_NUMBER, new Counter()); //_routeToCounterMap.get(routeString).put(CONTROLLER_ERROR_NUMBER, new Counter()); _routeToCounterMap.get(route).put(WORKER_TOTAL_NUMBER, new Counter()); //_routeToCounterMap.get(routeString).put(WORKER_ERROR_NUMBER, new Counter()); try { HelixKafkaMirrorMakerMetricsReporter.get().registerMetric(route + ".topic.totalNumber", _routeToCounterMap.get(route).get(TOPIC_TOTAL_NUMBER)); //HelixKafkaMirrorMakerMetricsReporter.get().registerMetric(routeString + ".topic.errorNumber", // _routeToCounterMap.get(routeString).get(TOPIC_ERROR_NUMBER)); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric(route + ".controller.totalNumber", _routeToCounterMap.get(route).get(CONTROLLER_TOTAL_NUMBER)); //HelixKafkaMirrorMakerMetricsReporter.get().registerMetric(routeString + "controller.errorNumber", // _routeToCounterMap.get(routeString).get(CONTROLLER_ERROR_NUMBER)); HelixKafkaMirrorMakerMetricsReporter.get().registerMetric(route + ".worker.totalNumber", _routeToCounterMap.get(route).get(WORKER_TOTAL_NUMBER)); //HelixKafkaMirrorMakerMetricsReporter.get().registerMetric(routeString + "worker.errorNumber", // _routeToCounterMap.get(routeString).get(WORKER_ERROR_NUMBER)); } catch (Exception e) { LOGGER.error("Error registering metrics!", e); } } } private String convert(String route) { return route.replace('@', '-').substring(1); } private void validateInstanceToTopicPartitionsMap(Map<String, Set<TopicPartition>> instanceToTopicPartitionsMap, Map<String, InstanceTopicPartitionHolder> instanceMap) { LOGGER.info("\n\nFor controller instanceToTopicPartitionsMap:"); Map<String, String> instanceIdAndNameMap = 
HelixUtils.getInstanceToHostnameMap(_helixManager); int validateWrongCount = 0; int lowUrgencyValidateWrongCount = 0; for (String instanceId : instanceToTopicPartitionsMap.keySet()) { String hostname = instanceIdAndNameMap.containsKey(instanceId) ? instanceIdAndNameMap.get(instanceId) : ""; Set<TopicPartition> topicPartitions = instanceToTopicPartitionsMap.get(instanceId); Set<TopicPartition> routeSet = new HashSet<>(); // TODO: one instance suppose to have only one route for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (topicName.startsWith(SEPARATOR)) { routeSet.add(tp); } } if (routeSet.size() != 1) { Set<String> topicRouteSet = new HashSet<>(); for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (!topicName.startsWith(SEPARATOR)) { topicRouteSet.add(tp.getPipeline()); } } validateWrongCount++; LOGGER.error( "Validate WRONG: Incorrect route found for Hostname: {}, InstanceId: {}, route: {}, pipelines: {}, #workers: {}, worker: {}", hostname, instanceId, routeSet, topicRouteSet, instanceMap.get(instanceId).getWorkerSet().size(), instanceMap.get(instanceId).getWorkerSet()); } else { int partitionCount = 0; Set<TopicPartition> mismatchTopicPartition = new HashSet<>(); TopicPartition route = routeSet.iterator().next(); String routeString = route.getTopic() + SEPARATOR + route.getPartition(); for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (!topicName.startsWith(SEPARATOR)) { partitionCount += tp.getPartition(); if (!tp.getPipeline().equals(routeString)) { mismatchTopicPartition.add(tp); } } } if (mismatchTopicPartition.isEmpty() && StringUtils.isNotEmpty(hostname)) { LOGGER.info( "Validate OK: Hostname: {}, InstanceId: {}, route: {}, #topics: {}, #partitions: {}, #workers: {}, worker: {}", hostname, instanceId, routeSet, topicPartitions.size() - 1, partitionCount, instanceMap.get(instanceId).getWorkerSet().size(), instanceMap.get(instanceId).getWorkerSet()); try { // try 
find topic mismatch between manager and controller String topicResult = HttpClientUtils.getData(_httpClient, _requestConfig, hostname, _controllerPort, "/topics"); LOGGER.debug("Get topics from {}: {}", hostname, topicResult); String rawTopicNames = topicResult; if (!rawTopicNames.equals("No topic is added in MirrorMaker Controller!")) { rawTopicNames = topicResult.substring(25, topicResult.length() - 1); } Set<String> controllerTopics = new HashSet<>(); if (!rawTopicNames.equals("No topic is added in MirrorMaker Controller!")) { String[] topicNames = rawTopicNames.split(", "); for (String name : topicNames) { controllerTopics.add(name); } } Set<String> topicOnlyInManager = new HashSet<>(); for (TopicPartition tp : topicPartitions) { if (!controllerTopics.contains(tp.getTopic())) { topicOnlyInManager.add(tp.getTopic()); } else { controllerTopics.remove(tp.getTopic()); } } if (topicOnlyInManager.size() > 1 || (topicOnlyInManager.size() == 1 && !topicOnlyInManager.iterator().next().startsWith(SEPARATOR))) { validateWrongCount++; LOGGER.error( "Validate WRONG: Hostname: {}, InstanceId: {}, route: {}, topic only in manager: {}", hostname, instanceId, routeSet, topicOnlyInManager); } if (!controllerTopics.isEmpty()) { validateWrongCount++; LOGGER.error( "Validate WRONG: Hostname: {}, InstanceId: {}, route: {}, topic only in controller: {}", hostname, instanceId, routeSet, controllerTopics); } } catch (Exception e) { validateWrongCount++; LOGGER.error("Validate WRONG: Get topics error when connecting to {} for route {}", hostname, routeSet, e); } try { // try find worker mismatch between manager and controller String instanceResult = HttpClientUtils.getData(_httpClient, _requestConfig, hostname, _controllerPort, "/instances"); LOGGER.debug("Get workers from {}: {}", hostname, instanceResult); JSONObject instanceResultJson = JSON.parseObject(instanceResult); JSONArray allInstances = instanceResultJson.getJSONArray("allInstances"); Set<String> controllerWorkers = new 
HashSet<>(); for (Object instance : allInstances) { controllerWorkers.add(String.valueOf(instance)); } Set<String> managerWorkers = instanceMap.get(instanceId).getWorkerSet(); Set<String> workerOnlyInManager = new HashSet<>(); for (String worker : managerWorkers) { if (!controllerWorkers.contains(worker)) { workerOnlyInManager.add(worker); } else { controllerWorkers.remove(worker); } } if (!workerOnlyInManager.isEmpty()) { lowUrgencyValidateWrongCount++; LOGGER.warn( "Validate WRONG: Hostname: {}, InstanceId: {}, route: {}, worker only in manager: {}", hostname, instanceId, routeSet, workerOnlyInManager); } if (!controllerWorkers.isEmpty()) { validateWrongCount++; LOGGER.error( "Validate WRONG: Hostname: {}, InstanceId: {}, route: {}, worker only in controller: {}", hostname, instanceId, routeSet, controllerWorkers); } } catch (Exception e) { validateWrongCount++; LOGGER.error("Validate WRONG: Get workers error when connecting to {} for route {}", hostname, routeSet, e); } } else if (StringUtils.isEmpty(hostname)) { validateWrongCount++; LOGGER.error("Validate WRONG: Trying to get hostname for InstanceId: {} failed ", instanceId); } else { validateWrongCount++; LOGGER.error( "Validate WRONG: mismatch route found for Hostname: {}, InstanceId: {}, route: {}, mismatch: {}, #workers: {}, worker: {}", hostname, instanceId, routeSet, mismatchTopicPartition, instanceMap.get(instanceId).getWorkerSet().size(), instanceMap.get(instanceId).getWorkerSet()); } } } LOGGER.info("\n\n"); Map<String, Set<String>> topicToRouteMap = new HashMap<>(); for (String instanceId : instanceToTopicPartitionsMap.keySet()) { Set<TopicPartition> topicPartitions = instanceToTopicPartitionsMap.get(instanceId); Set<TopicPartition> routeSet = new HashSet<>(); // TODO: one instance suppose to have only one route for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (topicName.startsWith(SEPARATOR)) { routeSet.add(tp); } } TopicPartition route = routeSet.iterator().next(); 
String routeString = route.getTopic() + SEPARATOR + route.getPartition(); for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (!topicName.startsWith(SEPARATOR)) { if (!topicToRouteMap.containsKey(topicName)) { topicToRouteMap.put(topicName, new HashSet<>()); topicToRouteMap.get(topicName).add(routeString); } else { Set<String> existingRouteSet = topicToRouteMap.get(topicName); Iterator<String> iter = existingRouteSet.iterator(); while (iter.hasNext()) { String existingRoute = iter.next(); if (existingRoute.split(SEPARATOR)[0].equals(routeString.split(SEPARATOR)[0])) { iter.remove(); } } if (existingRouteSet.isEmpty()) { topicToRouteMap.remove(topicName); } } } } } LOGGER.info("Non-parity topicToRouteMap: {}", topicToRouteMap); if (_helixManager.isLeader()) { _nonParityTopic.inc(topicToRouteMap.size() - _nonParityTopic.getCount()); } LOGGER.info("\n\nFor controller _pipelineToInstanceMap:"); Map<String, Set<String>> workerMap = new HashMap<>(); for (String pipeline : _pipelineToInstanceMap.keySet()) { PriorityQueue<InstanceTopicPartitionHolder> itphSet = _pipelineToInstanceMap.get(pipeline); for (InstanceTopicPartitionHolder itph : itphSet) { Set<String> workers = itph.getWorkerSet(); for (String worker : workers) { if (workerMap.containsKey(worker)) { workerMap.get(worker).add(itph.getRouteString()); } else { Set<String> routeSet = new HashSet<>(); routeSet.add(itph.getRouteString()); workerMap.put(worker, routeSet); } } } } for (String worker : workerMap.keySet()) { if (workerMap.get(worker).size() != 1) { validateWrongCount++; LOGGER.error("Validate WRONG: wrong worker assignment for worker: {}, route: {}", worker, workerMap.get(worker)); } } if (_helixManager.isLeader()) { _validateWrongCount.inc(validateWrongCount - _validateWrongCount.getCount()); _lowUrgencyValidateWrongCount .inc(lowUrgencyValidateWrongCount - _lowUrgencyValidateWrongCount.getCount()); updateMetrics(instanceToTopicPartitionsMap, instanceMap); } } private void 
updateMetrics(Map<String, Set<TopicPartition>> instanceToTopicPartitionsMap, Map<String, InstanceTopicPartitionHolder> instanceMap) { // int[3]: 0: #topic, 1: #controller, 2: #worker Map<String, int[]> currRouteInfo = new ConcurrentHashMap<>(); //LOGGER.info("instanceToTopicPartitionsMap: {}", instanceToTopicPartitionsMap); for (String instanceName : instanceToTopicPartitionsMap.keySet()) { Set<TopicPartition> topicPartitions = instanceToTopicPartitionsMap.get(instanceName); for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (topicName.startsWith(SEPARATOR)) { // route String route = topicName + SEPARATOR + tp.getPartition(); String routeString = convert(route); currRouteInfo.putIfAbsent(routeString, new int[3]); currRouteInfo.get(routeString)[1]++; currRouteInfo.get(routeString)[2] += instanceMap.get(instanceName).getWorkerSet().size(); // register metrics if needed maybeRegisterMetrics(routeString); } else { // topic String route = tp.getPipeline(); String routeString = convert(route); currRouteInfo.putIfAbsent(routeString, new int[3]); currRouteInfo.get(routeString)[0]++; } } } //LOGGER.info("currRouteInfo: {}", currRouteInfo); //LOGGER.info("_routeToCounterMap: {}", _routeToCounterMap); for (String routeString : _routeToCounterMap.keySet()) { int topicTotalNumber = 0; int controllerTotalNumber = 0; int workerTotalNumber = 0; if (currRouteInfo.containsKey(routeString)) { topicTotalNumber = currRouteInfo.get(routeString)[0]; controllerTotalNumber = currRouteInfo.get(routeString)[1]; workerTotalNumber = currRouteInfo.get(routeString)[2]; } Counter topicTotalNumberCounter = _routeToCounterMap.get(routeString).get(TOPIC_TOTAL_NUMBER); topicTotalNumberCounter.inc(topicTotalNumber - topicTotalNumberCounter.getCount()); Counter controllerTotalNumberCounter = _routeToCounterMap.get(routeString).get(CONTROLLER_TOTAL_NUMBER); controllerTotalNumberCounter.inc(controllerTotalNumber - controllerTotalNumberCounter.getCount()); Counter 
workerTotalNumberCounter = _routeToCounterMap.get(routeString).get(WORKER_TOTAL_NUMBER); workerTotalNumberCounter.inc(workerTotalNumber - workerTotalNumberCounter.getCount()); // LOGGER.info("update metrics for {}", routeString); } } public synchronized void updateCurrentStatus() { _lock.lock(); try { long currTimeMs = System.currentTimeMillis(); if (currTimeMs - lastUpdateTimeMs < _conf.getUpdateStatusCoolDownMs()) { LOGGER.info("Only {} ms since last updateCurrentStatus, wait for next one", currTimeMs - lastUpdateTimeMs); return; } LOGGER.info("Trying to run controller updateCurrentStatus"); _workerHelixManager.updateCurrentStatus(); // Map<InstanceName, InstanceTopicPartitionHolder> Map<String, InstanceTopicPartitionHolder> instanceMap = new HashMap<>(); // Map<TopicName, Map<Pipeline, Instance>> Map<String, Map<String, InstanceTopicPartitionHolder>> currTopicToPipelineInstanceMap = new HashMap<>(); // Map<Pipeline, PriorityQueue<Instance>> Map<String, PriorityQueue<InstanceTopicPartitionHolder>> currPipelineToInstanceMap = new HashMap<>(); // Set<InstanceName> List<String> currAvailableControllerList = new ArrayList<>(); Map<TopicPartition, List<String>> workerRouteToInstanceMap = _workerHelixManager .getWorkerRouteToInstanceMap(); // Map<Instance, Set<Pipeline>> from IdealState Map<String, Set<TopicPartition>> instanceToTopicPartitionsMap = HelixUtils .getInstanceToTopicPartitionsMap(_helixManager, _srcKafkaValidationManager.getClusterToObserverMap()); List<String> liveInstances = HelixUtils.liveInstances(_helixManager); currAvailableControllerList.addAll(liveInstances); int assignedControllerCount = 0; for (String instanceId : instanceToTopicPartitionsMap.keySet()) { Set<TopicPartition> topicPartitions = instanceToTopicPartitionsMap.get(instanceId); // TODO: one instance suppose to have only one route for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (topicName.startsWith(SEPARATOR)) { 
currPipelineToInstanceMap.putIfAbsent(topicName, new PriorityQueue<>(1, InstanceTopicPartitionHolder.totalWorkloadComparator(_pipelineWorkloadMap))); InstanceTopicPartitionHolder itph = new InstanceTopicPartitionHolder(instanceId, tp); if (workerRouteToInstanceMap.get(tp) != null) { itph.addWorkers(workerRouteToInstanceMap.get(tp)); } currPipelineToInstanceMap.get(topicName).add(itph); instanceMap.put(instanceId, itph); currAvailableControllerList.remove(instanceId); assignedControllerCount++; } } for (TopicPartition tp : topicPartitions) { String topicName = tp.getTopic(); if (!topicName.startsWith(SEPARATOR)) { if (instanceMap.containsKey(instanceId)) { instanceMap.get(instanceId).addTopicPartition(tp); currTopicToPipelineInstanceMap.putIfAbsent(topicName, new ConcurrentHashMap<>()); currTopicToPipelineInstanceMap.get(tp.getTopic()) .put(getPipelineFromRoute(tp.getPipeline()), instanceMap.get(instanceId)); } } } } _pipelineToInstanceMap = currPipelineToInstanceMap; _topicToPipelineInstanceMap = currTopicToPipelineInstanceMap; _availableControllerList = currAvailableControllerList; if (_helixManager.isLeader()) { _availableController.inc(_availableControllerList.size() - _availableController.getCount()); _availableWorker .inc(_workerHelixManager.getAvailableWorkerList().size() - _availableWorker.getCount()); _assignedControllerCount.inc(assignedControllerCount - _assignedControllerCount.getCount()); } // Validation validateInstanceToTopicPartitionsMap(instanceToTopicPartitionsMap, instanceMap); //LOGGER.info("For controller _pipelineToInstanceMap: {}", _pipelineToInstanceMap); //LOGGER.info("For controller _topicToPipelineInstanceMap: {}", _topicToPipelineInstanceMap); LOGGER.info("For controller {} available", _availableControllerList.size()); lastUpdateTimeMs = System.currentTimeMillis(); } catch (Exception e) { LOGGER.error("Got exception in updateCurrentStatus", e); } finally { _lock.unlock(); } } public List<String> extractTopicList(String response) { String 
topicList = "No topic is added in MirrorMaker Controller!".equals(response)
      ? ""
      : response.substring(25, response.length() - 1);
  // Extracts topic names from a controller "/topics" response: the first 25 characters and
  // the final character are stripped and the remainder is split on commas. The controller's
  // "no topic" sentinel yields an empty list. Names are trimmed because the controller joins
  // them with ", " (the validation code in this class splits on exactly that), and blank
  // entries are dropped so an empty payload does not produce a bogus "" topic.
  List<String> result = new ArrayList<>();
  for (String topic : topicList.split(",")) {
    String name = topic.trim();
    if (!name.isEmpty()) {
      result.add(name);
    }
  }
  return result;
}

/**
 * Resolves the hostname Helix has recorded for a controller instance.
 *
 * @param instanceId Helix instance id
 * @return the non-empty hostname
 * @throws InternalError if no hostname is known for {@code instanceId}
 */
private String getHostname(String instanceId) {
  Map<String, String> instanceIdAndNameMap = HelixUtils.getInstanceToHostnameMap(_helixManager);
  String hostname =
      instanceIdAndNameMap.containsKey(instanceId) ? instanceIdAndNameMap.get(instanceId) : "";
  if (StringUtils.isEmpty(hostname)) {
    throw new InternalError(String.format("Failed to find hostname for instanceId %s", instanceId));
  }
  return hostname;
}

/**
 * Asks every controller that serves {@code topicName} for its view of the topic.
 *
 * <p>Failures for an individual controller are logged and skipped, so the result may be
 * partial.
 *
 * @param topicName topic to look up
 * @return a JSON object keyed by route string; empty when the topic is not known to this
 *     manager or no controller answered
 */
public JSONObject getTopicInfoFromController(String topicName) {
  JSONObject resultJson = new JSONObject();
  Map<String, InstanceTopicPartitionHolder> pipelineToInstanceMap = _topicToPipelineInstanceMap
      .get(topicName);
  if (pipelineToInstanceMap == null) {
    // Unknown topic: nothing to query.
    return resultJson;
  }
  for (InstanceTopicPartitionHolder itph : pipelineToInstanceMap.values()) {
    try {
      String hostname = getHostname(itph.getInstanceName());
      String topicResponseBody = HttpClientUtils.getData(_httpClient, _requestConfig,
          hostname, _controllerPort, "/topics/" + topicName);
      JSONObject topicsInfoInJson = JSON.parseObject(topicResponseBody);
      resultJson.put(itph.getRouteString(), topicsInfoInJson);
    } catch (Exception | InternalError e) {
      // getHostname signals a missing hostname with InternalError, which is not an Exception;
      // catch it explicitly so one unresolved instance does not abort the whole lookup.
      LOGGER.warn("Failed to curl topic info from controller: {}", itph.getInstanceName(), e);
    }
  }
  return resultJson;
}

/** Returns the Helix ideal state of the resource named {@code topicName}. */
public IdealState getIdealStateForTopic(String topicName) {
  return _helixAdmin.getResourceIdealState(_helixClusterName, topicName);
}

/** Returns the Helix external view of the resource named {@code topicName}. */
public ExternalView getExternalViewForTopic(String topicName) {
  return _helixAdmin.getResourceExternalView(_helixClusterName, topicName);
}

/** Returns whether a Helix resource named {@code pipeline} exists in this cluster. */
public boolean isPipelineExisted(String pipeline) {
  return _helixAdmin.getResourcesInCluster(_helixClusterName).contains(pipeline);
}

/** Returns whether a Helix resource named {@code topicName} exists in this cluster. */
public boolean isTopicExisted(String topicName) {
  return _helixAdmin.getResourcesInCluster(_helixClusterName).contains(topicName);
}

public boolean
isTopicPipelineExisted(String topicName, String pipeline) { if (!isPipelineExisted(pipeline) || !isTopicExisted(topicName)) { return false; } for (String partition : getIdealStateForTopic(topicName).getPartitionSet()) { if (partition.startsWith(pipeline)) { return true; } } return false; } public List<String> getPipelineLists() { List<String> pipelineList = new ArrayList<>(); for (String resource : _helixAdmin.getResourcesInCluster(_helixClusterName)) { if (resource.startsWith(SEPARATOR)) { pipelineList.add(resource); } } return pipelineList; } public List<String> getTopicLists() { List<String> toplicList = new ArrayList<>(); for (String resource : _helixAdmin.getResourcesInCluster(_helixClusterName)) { if (!resource.startsWith(SEPARATOR)) { toplicList.add(resource); } } return toplicList; } public Map<String, Map<String, InstanceTopicPartitionHolder>> getTopicToPipelineInstanceMap() { return _topicToPipelineInstanceMap; } public Map<String, PriorityQueue<InstanceTopicPartitionHolder>> getPipelineToInstanceMap() { return _pipelineToInstanceMap; } public Map<String, InstanceTopicPartitionHolder> getTopic(String topicName) { return _topicToPipelineInstanceMap.get(topicName); } public synchronized void handleLiveInstanceChange(boolean onlyCheckOffline, boolean forceBalance) throws Exception { _lock.lock(); try { LOGGER.info("handleLiveInstanceChange() wake up!"); // Check if any controller in route is down Map<String, Set<TopicPartition>> instanceToTopicPartitionsMap = HelixUtils .getInstanceToTopicPartitionsMap(_helixManager, _srcKafkaValidationManager.getClusterToObserverMap()); List<String> liveInstances = HelixUtils.liveInstances(_helixManager); List<String> instanceToReplace = new ArrayList<>(); boolean routeControllerDown = false; // Check if any worker in route is down boolean routeWorkerDown = false; if (_enableRebalance || forceBalance) { for (String instanceName : instanceToTopicPartitionsMap.keySet()) { if (!liveInstances.contains(instanceName)) { 
routeControllerDown = true; instanceToReplace.add(instanceName); } } LOGGER.info("Controller need to replace: {}", instanceToReplace); // Make sure controller status is up-to-date updateCurrentStatus(); // Happy scenario: instance contains route topic for (String instance : instanceToReplace) { Set<TopicPartition> tpOrRouteSet = instanceToTopicPartitionsMap.get(instance); for (TopicPartition tpOrRoute : tpOrRouteSet) { if (tpOrRoute.getTopic().startsWith(SEPARATOR)) { String pipeline = tpOrRoute.getTopic(); int routeId = tpOrRoute.getPartition(); // TODO: check if _availableControllerList is empty String newInstanceName = _availableControllerList.get(0); _availableControllerList.remove(0); LOGGER.info("Controller {} in route {}@{} will be replaced by {}", instance, pipeline, routeId, newInstanceName); InstanceTopicPartitionHolder newInstance = new InstanceTopicPartitionHolder( newInstanceName, tpOrRoute); List<TopicPartition> tpToReassign = new ArrayList<>(); PriorityQueue<InstanceTopicPartitionHolder> itphList = _pipelineToInstanceMap .get(pipeline); for (InstanceTopicPartitionHolder itph : itphList) { if (itph.getInstanceName().equals(instance)) { tpToReassign.addAll(itph.getServingTopicPartitionSet()); // TODO: is it possible to have different route on same host? 
break; } } // Helix doesn't guarantee the order of execution, so we have to wait for new controller to be online // before reassigning topics // But this might cause long rebalance time _helixAdmin.setResourceIdealState(_helixClusterName, pipeline, IdealStateBuilder.resetCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, pipeline), pipeline, String.valueOf(routeId), newInstanceName)); long ts1 = System.currentTimeMillis(); while (!isControllerOnline(newInstanceName, pipeline, String.valueOf(routeId))) { if (System.currentTimeMillis() - ts1 > 30000) { break; } try { // Based on testing, the wait time is usually in the order of 100 ms Thread.sleep(100); } catch (InterruptedException e) { e.printStackTrace(); } } long ts2 = System.currentTimeMillis(); LOGGER.info("Controller {} in route {}@{} is replaced by {}, it took {} ms", instance, pipeline, routeId, newInstanceName, ts2 - ts1); for (TopicPartition tp : tpToReassign) { _helixAdmin.setResourceIdealState(_helixClusterName, tp.getTopic(), IdealStateBuilder.resetCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, tp.getTopic()), tp.getTopic(), pipeline + SEPARATOR + routeId, newInstanceName)); } LOGGER.info( "Controller {} in route {}@{} is replaced by {}, topics are reassigned, it took {} ms", instance, pipeline, routeId, newInstanceName, System.currentTimeMillis() - ts2); break; } } } // Failure scenario: instance doesn't contain route topic // e.g. 
route and the topic in that route are not assigned to the same host // In this case, assume the instance of the route is correct and reassign the topic to that host for (String instance : instanceToTopicPartitionsMap.keySet()) { Set<TopicPartition> topicPartitionSet = instanceToTopicPartitionsMap.get(instance); if (topicPartitionSet.isEmpty()) { continue; } boolean foundRoute = false; for (TopicPartition tp : topicPartitionSet) { if (tp.getTopic().startsWith(SEPARATOR)) { foundRoute = true; break; } } if (!foundRoute) { routeControllerDown = true; String instanceForRoute = null; // Find the host for its route String route = topicPartitionSet.iterator().next().getPipeline(); for (String pipeline : _pipelineToInstanceMap.keySet()) { if (pipeline.equals(getPipelineFromRoute(route))) { for (InstanceTopicPartitionHolder itph : _pipelineToInstanceMap.get(pipeline)) { if (itph.getRouteString().equals(route)) { instanceForRoute = itph.getInstanceName(); break; } } } } LOGGER.info("Need to reassign: {} from {} to {}", topicPartitionSet, instance, instanceForRoute); for (TopicPartition tp : topicPartitionSet) { _helixAdmin.setResourceIdealState(_helixClusterName, tp.getTopic(), IdealStateBuilder.resetCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, tp.getTopic()), tp.getTopic(), route, instanceForRoute)); } } } if (routeControllerDown) { updateCurrentStatus(); } HelixManager workeManager = _workerHelixManager.getHelixManager(); Map<String, Set<TopicPartition>> workerInstanceToTopicPartitionsMap = HelixUtils .getInstanceToTopicPartitionsMap(workeManager, null); List<String> workerLiveInstances = HelixUtils.liveInstances(workeManager); Map<String, List<String>> workerPipelineToRouteIdToReplace = new HashMap<>(); List<String> workerToReplace = new ArrayList<>(); for (String instanceName : workerInstanceToTopicPartitionsMap.keySet()) { if (!workerLiveInstances.contains(instanceName)) { routeWorkerDown = true; TopicPartition route = 
workerInstanceToTopicPartitionsMap.get(instanceName).iterator() .next(); workerPipelineToRouteIdToReplace.putIfAbsent(route.getTopic(), new ArrayList<>()); workerPipelineToRouteIdToReplace.get(route.getTopic()) .add(String.valueOf(route.getPartition())); workerToReplace.add(instanceName); LOGGER.info("Worker changed: {} for {}", instanceName, route); } } if (!routeWorkerDown) { LOGGER.info("No worker in route is changed, do nothing!"); } else { LOGGER.info("Worker need to replace: {}, {}", workerToReplace, workerPipelineToRouteIdToReplace); // Make sure worker status is up-to-date if (!routeControllerDown) { updateCurrentStatus(); } _workerHelixManager.replaceWorkerInMirrorMaker(workerPipelineToRouteIdToReplace, workerToReplace); updateCurrentStatus(); } } else { LOGGER.info("AutoBalancing is disabled, do nothing"); } if (onlyCheckOffline) { return; } LOGGER.info("Start rebalancing current cluster"); // Haven't run updateCurrentStatus() before if (!routeControllerDown && !routeWorkerDown) { updateCurrentStatus(); } if (_enableAutoScaling) { scaleCurrentCluster(); } else { LOGGER.info("AutoScaling is disabled, do nothing"); } } finally { _lock.unlock(); } } private boolean isControllerOnline(String instance, String routeName, String routeId) { LOGGER.info("Check if {} is online for {}, {}", instance, routeName, routeId); try { String[] srcDst = routeName.split(SEPARATOR); String controllerWokerHelixClusterName = "/controller-worker-" + srcDst[1] + "-" + srcDst[2] + "-" + routeId; JSONObject json = JSON.parseObject( _zkClient.readData(controllerWokerHelixClusterName + "/CONTROLLER/LEADER").toString()); String currLeader = String.valueOf(json.get("id")); LOGGER.info("current leader is {}, expect {}", currLeader, instance); return currLeader.equals(instance); } catch (Exception e) { LOGGER.info("Got error when checking current leader", e); return false; } } public void scaleCurrentCluster() throws Exception { int oldTotalNumWorker = 0; int newTotalNumWorker = 0; 
Map<String, Integer> _routeWorkerOverrides = getRouteWorkerOverride(); for (String pipeline : _pipelineToInstanceMap.keySet()) { LOGGER.info("Start rescale pipeline: {}", pipeline); PriorityQueue<InstanceTopicPartitionHolder> newItphQueue = new PriorityQueue<>(1, InstanceTopicPartitionHolder.totalWorkloadComparator(_pipelineWorkloadMap)); // TODO: what if routeId is not continuous int nextRouteId = _pipelineToInstanceMap.get(pipeline).size(); for (InstanceTopicPartitionHolder itph : _pipelineToInstanceMap.get(pipeline)) { if (itph.getTotalNumPartitions() > _maxNumPartitionsPerRoute) { LOGGER.info( "Checking route {} with controller {} and topics {} since it exceeds maxNumPartitionsPerRoute {}", itph.getRouteString(), itph.getInstanceName(), itph.getServingTopicPartitionSet(), _maxNumPartitionsPerRoute); while (itph.getTotalNumPartitions() > _maxNumPartitionsPerRoute) { // Only one topic left, do nothing if (itph.getNumServingTopicPartitions() == 1) { LOGGER.info("Only one topic {} in route {}, do nothing", itph.getServingTopicPartitionSet().iterator().next(), itph.getRouteString()); break; } // Get the topic with largest number of partitions TopicPartition tpToMove = new TopicPartition("tmp", -1); for (TopicPartition tp : itph.getServingTopicPartitionSet()) { if (tp.getPartition() > tpToMove.getPartition()) { tpToMove = tp; } } // If existing lightest route cannot fit the largest topic to move if (newItphQueue.isEmpty() || newItphQueue.peek().getTotalNumPartitions() + tpToMove.getPartition() > _initMaxNumPartitionsPerRoute) { try { InstanceTopicPartitionHolder newHolder = createNewRoute(pipeline, nextRouteId); _helixAdmin.setResourceIdealState(_helixClusterName, tpToMove.getTopic(), IdealStateBuilder.resetCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, tpToMove.getTopic()), tpToMove.getTopic(), itph.getRouteString(), newHolder.getRouteString(), newHolder.getInstanceName())); itph.removeTopicPartition(tpToMove); 
newHolder.addTopicPartition(tpToMove); newItphQueue.add(newHolder); nextRouteId++; } catch (Exception e) { LOGGER.error("Got exception when create a new route when rebalancing, abandon!", e); throw new Exception( "Got exception when create a new route when rebalancing, abandon!", e); } } else { InstanceTopicPartitionHolder newHolder = newItphQueue.poll(); _helixAdmin.setResourceIdealState(_helixClusterName, tpToMove.getTopic(), IdealStateBuilder.resetCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, tpToMove.getTopic()), tpToMove.getTopic(), itph.getRouteString(), newHolder.getRouteString(), newHolder.getInstanceName())); itph.removeTopicPartition(tpToMove); newHolder.addTopicPartition(tpToMove); newItphQueue.add(newHolder); } } } newItphQueue.add(itph); } // After moving topics, scale workers based on workload int rescaleFailedCount = 0; for (InstanceTopicPartitionHolder itph : newItphQueue) { oldTotalNumWorker += itph.getWorkerSet().size(); String routeString = itph.getRouteString(); int initWorkerCount = _initMaxNumWorkersPerRoute; if (_routeWorkerOverrides.containsKey(routeString) && _routeWorkerOverrides.get(routeString) > initWorkerCount) { initWorkerCount = _routeWorkerOverrides.get(routeString); } String hostname = getHostname(itph.getInstanceName()); try { String result = HttpClientUtils.getData(_httpClient, _requestConfig, hostname, _controllerPort, "/admin/workloadinfo"); ControllerWorkloadInfo workloadInfo = JSONObject.parseObject(result, ControllerWorkloadInfo.class); TopicWorkload totalWorkload = workloadInfo.getTopicWorkload(); if (workloadInfo != null && workloadInfo.getNumOfExpectedWorkers() != 0) { _pipelineWorkloadMap.put(itph.getRouteString(), totalWorkload); int expectedNumWorkers = workloadInfo.getNumOfExpectedWorkers(); LOGGER.info("Current {} workers in route {}, expect {} workers", itph.getWorkerSet().size(), itph.getRouteString(), expectedNumWorkers); int actualExpectedNumWorkers = 
getActualExpectedNumWorkers(expectedNumWorkers, initWorkerCount); LOGGER.info("Current {} workers in route {}, actual expect {} workers", itph.getWorkerSet().size(), itph.getRouteString(), actualExpectedNumWorkers); if (actualExpectedNumWorkers > itph.getWorkerSet().size()) { LOGGER.info("Current {} workers in route {}, actual expect {} workers, add {} workers", itph.getWorkerSet().size(), itph.getRouteString(), actualExpectedNumWorkers, actualExpectedNumWorkers - itph.getWorkerSet().size()); // TODO: handle exception _workerHelixManager.addWorkersToMirrorMaker(itph, itph.getRoute().getTopic(), itph.getRoute().getPartition(), actualExpectedNumWorkers - itph.getWorkerSet().size()); } if (actualExpectedNumWorkers < itph.getWorkerSet().size()) { LOGGER.info( "Current {} workers in route {}, actual expect {} workers, remove {} workers", itph.getWorkerSet().size(), itph.getRouteString(), actualExpectedNumWorkers, itph.getWorkerSet().size() - actualExpectedNumWorkers); // TODO: handle exception _workerHelixManager.removeWorkersToMirrorMaker(itph, itph.getRoute().getTopic(), itph.getRoute().getPartition(), itph.getWorkerSet().size() - actualExpectedNumWorkers); } newTotalNumWorker += actualExpectedNumWorkers; } else { LOGGER.warn("Get workload on {} for route: {} returns 0. No change on number of workers", hostname, itph.getRouteString()); newTotalNumWorker += itph.getWorkerSet().size(); rescaleFailedCount++; } } catch (Exception e) { rescaleFailedCount++; LOGGER.error(String.format( "Get workload error when connecting to %s for route %s. 
No change on number of workers", hostname, itph.getRouteString()), e); newTotalNumWorker += itph.getWorkerSet().size(); rescaleFailedCount++; } } _pipelineToInstanceMap.put(pipeline, newItphQueue); _rescaleFailedCount.inc(rescaleFailedCount - _rescaleFailedCount.getCount()); } LOGGER.info("oldTotalNumWorker: {}, newTotalNumWorker: {}", oldTotalNumWorker, newTotalNumWorker); } private int getActualExpectedNumWorkers(int expectedNumWorkers, int initWorkerPerRoute) { if (expectedNumWorkers <= initWorkerPerRoute) { return initWorkerPerRoute; } if (expectedNumWorkers >= _maxNumWorkersPerRoute) { return _maxNumWorkersPerRoute; } return (int) (Math.ceil((double) (expectedNumWorkers - initWorkerPerRoute) / 5) * 5) + initWorkerPerRoute; } public int getExpectedNumWorkers(int currNumPartitions) { return Math.min(_maxNumWorkersPerRoute, _initMaxNumWorkersPerRoute + (_maxNumWorkersPerRoute - _initMaxNumWorkersPerRoute) * (currNumPartitions - _initMaxNumPartitionsPerRoute) / (_maxNumPartitionsPerRoute - _initMaxNumPartitionsPerRoute)); } public InstanceTopicPartitionHolder createNewRoute(String pipeline, int routeId) throws Exception { if (_availableControllerList.isEmpty()) { LOGGER.info("No available controller!"); throw new Exception("No available controller!"); } if (_workerHelixManager.getAvailableWorkerList().isEmpty()) { LOGGER.info("No available worker!"); throw new Exception("No available worker!"); } String instanceName = _availableControllerList.get(0); InstanceTopicPartitionHolder instance = new InstanceTopicPartitionHolder(instanceName, new TopicPartition(pipeline, routeId)); if (!isPipelineExisted(pipeline)) { setEmptyResourceConfig(pipeline); _helixAdmin.addResource(_helixClusterName, pipeline, IdealStateBuilder.buildCustomIdealStateFor(pipeline, String.valueOf(routeId), instance)); } else { LOGGER.info("Expanding pipeline {} new partition {} to instance {}", pipeline, routeId, instance); _helixAdmin.setResourceIdealState(_helixClusterName, pipeline, 
IdealStateBuilder.expandCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, pipeline), pipeline, String.valueOf(routeId), instance)); LOGGER.info("New IdealState: {}", _helixAdmin.getResourceIdealState(_helixClusterName, pipeline)); } String[] srcDst = pipeline.split(SEPARATOR); String controllerWokerHelixClusterName = "controller-worker-" + srcDst[1] + "-" + srcDst[2] + "-" + routeId; HelixManager spectator = HelixManagerFactory.getZKHelixManager(controllerWokerHelixClusterName, _instanceId, InstanceType.SPECTATOR, _helixZkURL); long ts1 = System.currentTimeMillis(); while (true) { try { spectator.connect(); break; } catch (Exception e) { // Do nothing } if (System.currentTimeMillis() - ts1 > 60000) { throw new Exception(String.format("Controller %s failed to set up new route cluster %s!", instanceName, controllerWokerHelixClusterName)); } Thread.sleep(1000); } _availableControllerList.remove(instanceName); _pipelineToInstanceMap.put(pipeline, new PriorityQueue<>(1, InstanceTopicPartitionHolder.totalWorkloadComparator(_pipelineWorkloadMap))); _pipelineToInstanceMap.get(pipeline).add(instance); _assignedControllerCount.inc(); _workerHelixManager.addTopicToMirrorMaker(instance, pipeline, routeId); // register metrics String routeString = srcDst[1] + "-" + srcDst[2] + "-" + routeId; maybeRegisterMetrics(routeString); spectator.disconnect(); return instance; } public InstanceTopicPartitionHolder maybeCreateNewRoute( PriorityQueue<InstanceTopicPartitionHolder> instanceList, String topicName, int numPartitions, String pipeline) throws Exception { LOGGER.info("maybeCreateNewRoute, topicName: {}, numPartitions: {}, pipeline: {}", topicName, numPartitions, pipeline); Set<Integer> routeIdSet = new HashSet<>(); for (InstanceTopicPartitionHolder instance : instanceList) { if (instance.getTotalNumPartitions() + numPartitions < _initMaxNumPartitionsPerRoute) { return instance; } routeIdSet.add(instance.getRoute().getPartition()); } // For now we don't 
delete route even it's empty. so routeId should be 0,1...N int routeId = 0; while (routeId < routeIdSet.size() + 1) { if (!routeIdSet.contains(routeId)) { break; } routeId++; } return createNewRoute(pipeline, routeId); } public synchronized void addTopicToMirrorMaker(String topicName, int numPartitions, String src, String dst, String pipeline) throws Exception { _lock.lock(); try { LOGGER.info("Trying to add topic: {} to pipeline: {}", topicName, pipeline); if (!isPipelineExisted(pipeline)) { createNewRoute(pipeline, 0); } else { LOGGER.info("Pipeline already existed!"); } boolean isSameDc = src.substring(0, 3).equals(dst.substring(0, 3)); InstanceTopicPartitionHolder instance = maybeCreateNewRoute(_pipelineToInstanceMap.get(pipeline), topicName, numPartitions, pipeline); String route = instance.getRouteString(); if (!isTopicExisted(topicName)) { setEmptyResourceConfig(topicName); _helixAdmin.addResource(_helixClusterName, topicName, IdealStateBuilder.buildCustomIdealStateFor(topicName, route, instance)); } else { _helixAdmin.setResourceIdealState(_helixClusterName, topicName, IdealStateBuilder.expandCustomIdealStateFor( _helixAdmin.getResourceIdealState(_helixClusterName, topicName), topicName, route, instance)); } instance.addTopicPartition(new TopicPartition(topicName, numPartitions, pipeline)); _topicToPipelineInstanceMap.putIfAbsent(topicName, new ConcurrentHashMap<>()); _topicToPipelineInstanceMap.get(topicName).put(pipeline, instance); } finally { _lock.unlock(); } } // TODO: fix status if accidentally expanding a topic to a larger number public synchronized void expandTopicInMirrorMaker(String topicName, String srcCluster, String pipeline, int newNumPartitions) throws Exception { _lock.lock(); try { LOGGER.info("Trying to expand topic: {} in pipeline: {} to {} partitions", topicName, pipeline, newNumPartitions); if (!isTopicPipelineExisted(topicName, pipeline)) { updateCurrentStatus(); } if (!isTopicPipelineExisted(topicName, pipeline)) { LOGGER.info("Topic 
{} doesn't exist in pipeline {}, abandon expanding topic", topicName, pipeline); throw new Exception(String.format("Topic %s doesn't exist in pipeline %s, abandon expanding topic!", topicName, pipeline)); } InstanceTopicPartitionHolder itph = _topicToPipelineInstanceMap.get(topicName).get(pipeline); boolean found = false; int oldNumPartitions = 0; for (TopicPartition tp : itph.getServingTopicPartitionSet()) { if (tp.getTopic().equals(topicName)) { found = true; oldNumPartitions = tp.getPartition(); if (newNumPartitions <= oldNumPartitions) { LOGGER.info( "New partition {} is not bigger than current partition {} of topic {}, abandon expanding topic", newNumPartitions, oldNumPartitions, topicName); throw new Exception(String.format( "New partition %s is not bigger than current partition %s of topic %s, " + "abandon expanding topic!", newNumPartitions, oldNumPartitions, topicName)); } } } if (!found) { LOGGER.info("Failed to find topic {} in pipeline {}, abandon expanding topic", topicName, pipeline); throw new Exception(String.format( "Failed to find topic %s in pipeline %s, abandon expanding topic!", topicName, pipeline)); } JSONObject entity = new JSONObject(); entity.put("topic", topicName); entity.put("numPartitions", newNumPartitions); String hostname = getHostname(itph.getInstanceName()); int respCode = HttpClientUtils.putData(_httpClient, _requestConfig, hostname, _controllerPort, "/topics", entity); if (respCode != 200) { LOGGER.info("Got error from controller {} when expanding topic {} with respCode {}", itph.getInstanceName(), topicName, respCode); throw new Exception( String.format("Got error from controller %s when expanding topic %s with respCode %s", itph.getInstanceName(), topicName, respCode)); } itph.removeTopicPartition(new TopicPartition(topicName, oldNumPartitions, pipeline)); itph.addTopicPartition(new TopicPartition(topicName, newNumPartitions, pipeline)); 
_srcKafkaValidationManager.getClusterToObserverMap().get(srcCluster).tryUpdateTopic(topicName); } finally { _lock.unlock(); } } public synchronized void deletePipelineInMirrorMaker(String pipeline) { // TODO: delete topic first _lock.lock(); try { LOGGER.info("Trying to delete pipeline: {}", pipeline); PriorityQueue<InstanceTopicPartitionHolder> itphSet = _pipelineToInstanceMap.get(pipeline); for (InstanceTopicPartitionHolder itph : itphSet) { if (itph.getTotalNumPartitions() != 0) { throw new UnsupportedOperationException( "Delete non-empty pipeline is not allowed, serving number of partitions: " + String.valueOf(itph.getTotalNumPartitions())); } } _workerHelixManager.deletePipelineInMirrorMaker(pipeline); _helixAdmin.dropResource(_helixClusterName, pipeline); _pipelineToInstanceMap.remove(pipeline); // Maybe clear instanceHolder's worker set List<String> topicsToDelete = new ArrayList<>(); for (String topic : _topicToPipelineInstanceMap.keySet()) { if (_topicToPipelineInstanceMap.get(topic).containsKey(pipeline)) { _topicToPipelineInstanceMap.get(topic).remove(pipeline); } if (_topicToPipelineInstanceMap.get(topic).isEmpty()) { topicsToDelete.add(topic); } } for (String topic : topicsToDelete) { _topicToPipelineInstanceMap.remove(topic); } } finally { _lock.unlock(); } } public synchronized void deleteTopicInMirrorMaker(String topicName, String src, String dst, String pipeline) throws Exception { _lock.lock(); try { LOGGER.info("Trying to delete topic: {} in pipeline: {}", topicName, pipeline); InstanceTopicPartitionHolder instance = _topicToPipelineInstanceMap.get(topicName).get(pipeline); IdealState currIdealState = getIdealStateForTopic(topicName); if (currIdealState.getPartitionSet().contains(instance.getRouteString()) && currIdealState.getNumPartitions() == 1) { _helixAdmin.dropResource(_helixClusterName, topicName); } else { _helixAdmin.setResourceIdealState(_helixClusterName, topicName, IdealStateBuilder .shrinkCustomIdealStateFor(currIdealState, 
topicName, instance.getRouteString())); } TopicPartition tp = _srcKafkaValidationManager.getClusterToObserverMap().get(src) .getTopicPartitionWithRefresh(topicName); instance.removeTopicPartition(tp); _topicToPipelineInstanceMap.get(topicName).remove(pipeline); if (_topicToPipelineInstanceMap.get(topicName).keySet().size() == 0) { _topicToPipelineInstanceMap.remove(topicName); } if (instance.getServingTopicPartitionSet().isEmpty()) { _availableControllerList.add(instance.getInstanceName()); _assignedControllerCount.dec(); } } finally { _lock.unlock(); } } private synchronized void setEmptyResourceConfig(String topicName) { _helixAdmin.setConfig(new HelixConfigScopeBuilder(ConfigScopeProperty.RESOURCE) .forCluster(_helixClusterName).forResource(topicName).build(), new HashMap<>()); } public SourceKafkaClusterValidationManager getSrcKafkaValidationManager() { return _srcKafkaValidationManager; } public WorkerHelixManager getWorkerHelixManager() { return _workerHelixManager; } public IuReplicatorConf getConf() { return _conf; } private static String getPipelineFromRoute(String route) { return route.substring(0, route.lastIndexOf("@")); } private static String getSrc(String pipeline) { String[] srcDst = pipeline.split(SEPARATOR); return srcDst[1]; } public void disableAutoScaling() { _enableAutoScaling = false; } public void enableAutoScaling() { _enableAutoScaling = true; } public boolean isAutoScalingEnabled() { return _enableAutoScaling; } public void disableAutoBalancing() { _enableRebalance = false; } public void enableAutoBalancing() { _enableRebalance = true; } public boolean isAutoBalancingEnabled() { return _enableRebalance; } public boolean getControllerAutobalancingStatus(String controllerInstance) throws ControllerException { try { String result = HttpClientUtils.getData(_httpClient, _requestConfig, controllerInstance, _controllerPort, "/admin/" + "autobalancing_status"); return result.equalsIgnoreCase("enabled"); } catch (IOException | URISyntaxException 
ex) { String msg = String.format("Got error from controller %s when trying to get balancing status", controllerInstance); LOGGER.error(msg, ex); throw new ControllerException(msg, ex); } } /** * RPC call to notify controller to change autobalancing status. * No retry * * @param controllerInstance The controller InstanceName * @param enable whether to enable autobalancing * @return */ public boolean notifyControllerAutobalancing(String controllerInstance, boolean enable) throws ControllerException { //TODO: need to convert to hostname after code change JSONObject entity = new JSONObject(); String cmd = enable ? "enable_autobalancing" : "disable_autobalancing"; try { String result = HttpClientUtils.getData(_httpClient, _requestConfig, controllerInstance, _controllerPort, "/admin/" + cmd); } catch (IOException | URISyntaxException ex) { String msg = String.format("Got error from controller %s when trying to do %s", controllerInstance, cmd); LOGGER.error(msg, ex); throw new ControllerException(msg, ex); } return true; } public Map<String, Integer> getRouteWorkerOverride() { List<ZNRecord> znRecordList = _helixPropertyStore.getChildren(PIPELINE_PATH, null, AccessOption.PERSISTENT); Map<String, Integer> hashMap = new HashMap<>(); if (znRecordList == null) { _helixPropertyStore.create(PIPELINE_PATH, new ZNRecord(""), AccessOption.PERSISTENT); return hashMap; } for (ZNRecord znRecord : znRecordList) { hashMap.put(znRecord.getId(), znRecord.getIntField(WORKER_NUMBER_OVERRIDE, _initMaxNumWorkersPerRoute)); } return hashMap; } public void updateRouteWorkerOverride(String pipeline, Integer value) { String resourcePath = PIPELINE_PATH + "/" + pipeline; ZNRecord znRecord = _helixPropertyStore.get(resourcePath, null, AccessOption.PERSISTENT); if (znRecord == null) { znRecord = new ZNRecord(pipeline); } znRecord.setIntField(WORKER_NUMBER_OVERRIDE, value); _helixPropertyStore.set(resourcePath, znRecord, AccessOption.PERSISTENT); } }