// Java tutorial example: Apache Drill-on-YARN cluster controller
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.yarn.appMaster;

import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.drill.yarn.appMaster.TaskLifecycleListener.Event;
import org.apache.drill.yarn.core.DoYUtil;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;

/**
 * Controls the Drill cluster by representing the current cluster state with a
 * desired state, taking corrective action to keep the cluster in the desired
 * state. The cluster as a whole has a state, as do each task (node) within the
 * cluster.
 * <p>
 * This class is designed to allow unit tests. In general, testing the
 * controller on a live cluster is tedious. This class encapsulates the
 * controller algorithm so it can be driven by a simulated cluster.
 * <p>
 * This object is shared between threads, thus synchronized.
 */
public class ClusterControllerImpl implements ClusterController {

  /**
   * Controller lifecycle state.
   */
  public enum State {
    /**
     * Cluster is starting. Things are in a partially-built state. No tasks are
     * started until the cluster moves to LIVE.
     */
    START,

    /**
     * Normal operating state: the controller seeks to maintain the desired
     * number of tasks.
     */
    LIVE,

    /**
     * Controller is shutting down. Tasks are gracefully (where possible)
     * ended; no new tasks are started. (That is, when we detect the exit of a
     * task, the controller no longer immediately tries to start a
     * replacement.)
     */
    ENDING,

    /**
     * The controller has shut down. All tasks and threads are stopped. The
     * controller allows the main thread (which has been patiently waiting) to
     * continue, allowing the AM itself to shut down. Thus, this is a very
     * short-lived state.
     */
    ENDED,

    /**
     * Something bad happened on start-up; the AM can't start and must shut
     * down.
     */
    FAILED
  }

  // YARN request priorities start at this offset; priority - offset is the
  // index into prioritizedGroups. See containersAllocated().
  private final static int PRIORITY_OFFSET = 1;

  private static final Log LOG = LogFactory.getLog(ClusterControllerImpl.class);

  /**
   * Signals the completion of the cluster run. The main program waits on this
   * mutex until all tasks complete (batch) or the cluster is explicitly shut
   * down (persistent tasks.)
   */
  private Object completionMutex = new Object();

  /**
   * Maximum number of retries for each task launch.
   */
  protected int maxRetries = 3;

  /**
   * Controller state.
   *
   * @see {@link State}
   */
  State state = State.START;

  /**
   * Definition of the task types that can be run by this controller, along
   * with the target task levels for each.
   */
  private Map<String, SchedulerStateActions> taskPools = new HashMap<>();

  /**
   * List of task pools prioritized in the order in which tasks should start.
   * DoY supports only one task pool at present. The idea is to, later, support
   * multiple pools that represent, say, pool 1 as the minimum number of
   * Drillbits to run at all times, with pool 2 as extra Drillbits to start up
   * during peak demand.
   * <p>
   * The priority also gives rise to YARN request priorities which are the only
   * tool the AM has to associate container grants with the requests to which
   * they correspond.
   */
  private List<SchedulerStateActions> prioritizedGroups = new ArrayList<>();

  /**
   * Cluster-wide association of YARN container IDs to tasks.
   */
  private Set<ContainerId> allocatedContainers = new HashSet<>();

  /**
   * Cluster-wide list of active tasks. Allows lookup from container ID to task
   * (and then from task to task type.)
   */
  private Map<ContainerId, Task> activeContainers = new HashMap<>();

  /**
   * Tracks the tasks that have completed: either successfully (state == ENDED)
   * or failed (state == FAILED). Eventually store this information elsewhere
   * to avoid cluttering memory with historical data. Entries here are static
   * copies, preserving the state at the time that the task completed.
   */
  private List<Task> completedTasks = new LinkedList<>();

  /**
   * Wrapper around the YARN API. Abstracts the details of YARN operations.
   */
  private final AMYarnFacade yarn;

  /**
   * Maximum number of new tasks to start on each "pulse" tick.
   */
  private int maxRequestsPerTick = 2;

  // Fixed typo: was "stopTimoutMs". Private field; the public getter
  // getStopTimeoutMs() is unchanged.
  private int stopTimeoutMs = 10_000;

  /**
   * Time (in ms) between request to YARN to get an updated list of the node
   * "inventory".
   */
  private int configPollPeriod = 60_000;
  private long nextResourcePollTime;

  /**
   * List of nodes available in the cluster. Necessary as part of the process
   * of ensuring that we run one Drillbit per node. (The YARN blacklist only
   * half works for this purpose.)
   */
  private NodeInventory nodeInventory;

  private long lastFailureCheckTime;

  private int failureCheckPeriodMs = 60_000;

  private int taskCheckPeriodMs = 10_000;

  private long lastTaskCheckTime;

  /**
   * To increase code modularity, add-ons (such as the ZK monitor) register as
   * lifecycle listeners that are alerted to "interesting" lifecycle events.
   */
  private List<TaskLifecycleListener> lifecycleListeners = new ArrayList<>();

  /**
   * Handy mechanism for setting properties on this controller that are
   * available to plugins and UI without cluttering this class with member
   * variables.
   */
  private Map<String, Object> properties = new HashMap<>();

  /**
   * When enabled, allows the controller to check for failures that result in
   * no drillbits running. The controller will then automatically exit as no
   * useful work can be done. Disable this to make debugging easier on a
   * single-node cluster (lets you, say, start a "stray" drill bit and see what
   * happens without the AM exiting.)
   */
  private boolean enableFailureCheck = true;

  public ClusterControllerImpl(AMYarnFacade yarn) {
    this.yarn = yarn;
  }

  @Override
  public void enableFailureCheck(boolean flag) {
    this.enableFailureCheck = flag;
  }

  /**
   * Define a task type. Registration order is important: the controller starts
   * task in the order that they are registered. Must happen before the YARN
   * callbacks start.
   *
   * @param scheduler the scheduler that defines the new task pool
   */
  @Override
  public void registerScheduler(Scheduler scheduler) {
    assert !taskPools.containsKey(scheduler.getName());
    scheduler.setPriority(taskPools.size() + PRIORITY_OFFSET);
    SchedulerStateActions taskGroup = new SchedulerStateImpl(this, scheduler);
    taskPools.put(taskGroup.getName(), taskGroup);
    prioritizedGroups.add(taskGroup);
  }

  /**
   * Called when the caller has completed start-up and the controller should
   * become live.
   */
  @Override
  public synchronized void started() throws YarnFacadeException, AMException {
    nodeInventory = new NodeInventory(yarn);

    // Verify that no resource seeks a container larger than
    // what YARN can provide. Ensures a graceful exit in this
    // case.

    Resource maxResource = yarn.getRegistrationResponse()
        .getMaximumResourceCapability();
    for (SchedulerStateActions group : prioritizedGroups) {
      group.getScheduler().limitContainerSize(maxResource);
    }
    state = State.LIVE;
  }

  @Override
  public synchronized void tick(long curTime) {
    if (state == State.LIVE) {
      adjustTasks(curTime);
      requestContainers();
    }
    if (state == State.LIVE || state == State.ENDING) {
      checkTasks(curTime);
    }
  }

  /**
   * Adjust the number of running tasks to match the desired level.
   *
   * @param curTime current controller time, in ms
   */
  private void adjustTasks(long curTime) {
    if (enableFailureCheck && getFreeNodeCount() == 0) {
      checkForFailure(curTime);
    }
    if (state != State.LIVE) {
      return;
    }
    for (SchedulerStateActions group : prioritizedGroups) {
      group.adjustTasks();
    }
  }

  /**
   * Get the approximate number of free YARN nodes (those that can
   * accept a task request.) Starts with the number of nodes from
   * the node inventory, then subtracts any in-flight requests (which
   * do not, by definition, have a node allocated.)
   * <p>
   * This approximation <b>does not</b> consider whether the node
   * has sufficient resources to run a task; only whether the node
   * itself exists.
   *
   * @return The approximate number of free YARN nodes.
   */
  @Override
  public int getFreeNodeCount() {
    int count = nodeInventory.getFreeNodeCount();
    for (SchedulerStateActions group : prioritizedGroups) {
      count -= group.getRequestCount();
    }
    return Math.max(0, count);
  }

  /**
   * Check if the controller is unable to run any tasks. If so, and the option
   * is enabled, then automatically exit since no useful work can be done.
   *
   * @param curTime current controller time, in ms
   */
  private void checkForFailure(long curTime) {
    // Check periodically, not on every tick.
    if (lastFailureCheckTime + failureCheckPeriodMs > curTime) {
      return;
    }
    lastFailureCheckTime = curTime;
    for (SchedulerStateActions group : prioritizedGroups) {
      if (group.getTaskCount() > 0) {
        return;
      }
    }
    LOG.error(
        "Application failure: no tasks are running and no nodes are available -- exiting.");
    terminate(State.FAILED);
  }

  /**
   * Periodically check tasks, handling any timeout issues.
   *
   * @param curTime current controller time, in ms
   */
  private void checkTasks(long curTime) {

    // Check periodically, not on every tick.

    if (lastTaskCheckTime + taskCheckPeriodMs > curTime) {
      return;
    }
    lastTaskCheckTime = curTime;

    // Check for task timeouts in states that have a timeout.

    EventContext context = new EventContext(this);
    for (SchedulerStateActions group : prioritizedGroups) {
      context.setGroup(group);
      group.checkTasks(context, curTime);
    }
  }

  /**
   * Get an update from YARN on available resources. Throttled to once per
   * {@link #configPollPeriod}.
   */
  @Override
  public void updateRMStatus() {
    long curTime = System.currentTimeMillis();
    if (nextResourcePollTime > curTime) {
      return;
    }

    // yarnNodeCount = yarn.getNodeCount();
    // LOG.info("YARN reports " + yarnNodeCount + " nodes.");

    // Resource yarnResources = yarn.getResources();
    // if (yarnResources != null) {
    // LOG.info("YARN reports " + yarnResources.getMemory() + " MB, " +
    // yarnResources.getVirtualCores()
    // + " vcores available.");
    // }
    nextResourcePollTime = curTime + configPollPeriod;
  }

  /**
   * Request any containers that have accumulated. Limits the number of new
   * requests per tick; stops at the first group that issued requests.
   */
  private void requestContainers() {
    EventContext context = new EventContext(this);
    for (SchedulerStateActions group : prioritizedGroups) {
      context.setGroup(group);
      if (group.requestContainers(context, maxRequestsPerTick)) {
        break;
      }
    }
  }

  @Override
  public synchronized void containersAllocated(List<Container> containers) {
    EventContext context = new EventContext(this);
    for (Container container : containers) {
      if (allocatedContainers.contains(container.getId())) {
        continue;
      }

      // We should never get a container on a node in the blacklist we
      // sent to YARN. If we do, something is wrong. Log the error and
      // reject the container. Else, bad things happen further along as
      // the tracking mechanisms assume one task per node.

      String host = container.getNodeId().getHost();
      if (nodeInventory.isInUse(host)) {
        LOG.error("Host is in use, but YARN allocated a container: "
            + DoYUtil.labelContainer(container) + " - container rejected.");
        yarn.releaseContainer(container);
        continue;
      }

      // The container is fine.

      allocatedContainers.add(container.getId());
      int priority = container.getPriority().getPriority();
      int offset = priority - PRIORITY_OFFSET;
      // BUG FIX: the bound was "offset > prioritizedGroups.size()", which let
      // offset == size() through to get(offset) and threw
      // IndexOutOfBoundsException; the valid range is [0, size()).
      if (offset < 0 || offset >= prioritizedGroups.size()) {
        LOG.error("Container allocated with unknown priority "
            + DoYUtil.labelContainer(container));
        continue;
      }
      context.setGroup(prioritizedGroups.get(offset));
      context.group.containerAllocated(context, container);
    }
  }

  @Override
  public synchronized void containerStarted(ContainerId containerId) {
    Task task = getTask(containerId);
    if (task == null) {
      return;
    }
    EventContext context = new EventContext(this, task);
    context.getState().containerStarted(context);
    LOG.trace("Container started: " + containerId);
  }

  @Override
  public synchronized void taskStartFailed(ContainerId containerId,
      Throwable t) {
    Task task = getTask(containerId);
    if (task == null) {
      return;
    }
    EventContext context = new EventContext(this, task);
    context.getState().launchFailed(context, t);
  }

  private Task getTask(ContainerId containerId) {
    return activeContainers.get(containerId);
  }

  @Override
  public synchronized void containerStopped(ContainerId containerId) {
    // Ignored because the node manager notification is very
    // unreliable. Better to rely on the Resource Manager
    // completion request.
    // Task task = getTask(containerId);
    // if (task == null) {
    // return; }
    // EventContext context = new EventContext(this, task);
    // context.getState().containerStopped(context);
  }

  @Override
  public synchronized void containersCompleted(List<ContainerStatus> statuses) {
    EventContext context = new EventContext(this);
    for (ContainerStatus status : statuses) {
      Task task = getTask(status.getContainerId());
      // BUG FIX: removed a duplicated, nested "if (task == null)" check that
      // wrapped this same warning.
      if (task == null) {
        // Will occur if a container was allocated but rejected.
        // Any other occurrence is unexpected and an error.

        LOG.warn("Container completed but no associated task state: "
            + status.getContainerId());
        continue;
      }
      context.setTask(task);
      context.getState().containerCompleted(context, status);
    }
    checkStatus();
  }

  /**
   * Report overall progress as completed work over total work.
   *
   * @return progress in the range [0, 1]; 1 when there is no work to do
   */
  @Override
  public synchronized float getProgress() {
    int numerator = 0;
    int denominator = 0;
    for (SchedulerStateActions group : taskPools.values()) {
      Scheduler sched = group.getScheduler();
      // Assumes progress[0] = completed units, progress[1] = total units —
      // TODO confirm against Scheduler.getProgress().
      int[] progress = sched.getProgress();
      numerator += progress[0];
      denominator += progress[1];
    }
    // BUG FIX: the ratio was inverted (denominator / numerator, guarded on
    // numerator == 0), which reports values greater than 1 once any work
    // completes. Guard the divisor and divide completed by total.
    if (denominator == 0) {
      return 1;
    }
    return (float) numerator / (float) denominator;
  }

  @Override
  public synchronized void stopTaskFailed(ContainerId containerId,
      Throwable t) {
    Task task = getTask(containerId);
    if (task == null) {
      return;
    }
    EventContext context = new EventContext(this, task);
    context.getState().stopTaskFailed(context, t);
  }

  @Override
  public synchronized void resizeDelta(int delta) {
    // TODO: offer the delta to each scheduler in turn.
    // For now, we support only one scheduler.

    prioritizedGroups.get(0).getScheduler().change(delta);
  }

  @Override
  public synchronized int resizeTo(int n) {
    // TODO: offer the delta to each scheduler in turn.
    // For now, we support only one scheduler.

    return prioritizedGroups.get(0).getScheduler().resize(n);
  }

  @Override
  public synchronized void shutDown() {
    LOG.info("Shut down request received");
    this.state = State.ENDING;
    EventContext context = new EventContext(this);
    for (SchedulerStateActions group : prioritizedGroups) {
      group.shutDown(context);
    }
    checkStatus();
  }

  @Override
  public boolean waitForCompletion() {
    start();
    synchronized (completionMutex) {
      try {
        completionMutex.wait();
        LOG.info("Controller shut down completed");
      } catch (InterruptedException e) {
        // Should not happen, but restore the interrupt flag so callers
        // further up the stack can observe the interruption.
        Thread.currentThread().interrupt();
      }
    }
    return succeeded();
  }

  private void start() {
    yarnReport();
  }

  private void yarnReport() {
    RegisterApplicationMasterResponse response = yarn.getRegistrationResponse();
    LOG.info("YARN queue: " + response.getQueue());
    Resource resource = response.getMaximumResourceCapability();
    LOG.info("YARN max resource: " + resource.getMemory() + " MB, "
        + resource.getVirtualCores() + " cores");
    EnumSet<SchedulerResourceTypes> types = response
        .getSchedulerResourceTypes();
    StringBuilder buf = new StringBuilder();
    String sep = "";
    for (SchedulerResourceTypes type : types) {
      buf.append(sep);
      buf.append(type.toString());
      sep = ", ";
    }
    LOG.info("YARN scheduler resource types: " + buf.toString());
  }

  /**
   * Check for overall completion. We are done when either we've successfully
   * run all tasks, or we've run some and given up on others. We're done when
   * the number of completed or failed tasks reaches our target.
   */
  private void checkStatus() {
    if (state != State.ENDING) {
      return;
    }
    for (SchedulerStateActions group : prioritizedGroups) {
      if (!group.isDone()) {
        return;
      }
    }
    terminate(State.ENDED);
  }

  private void terminate(State state) {
    this.state = state;
    synchronized (completionMutex) {
      completionMutex.notify();
    }
  }

  public boolean isLive() {
    return state == State.LIVE;
  }

  public boolean succeeded() {
    return state == State.ENDED;
  }

  public void containerAllocated(Task task) {
    activeContainers.put(task.getContainerId(), task);
  }

  public AMYarnFacade getYarn() {
    return yarn;
  }

  public void containerReleased(Task task) {
    activeContainers.remove(task.getContainerId());
  }

  public void taskEnded(Task task) {
    completedTasks.add(task);
  }

  public void taskRetried(Task task) {
    // Preserve a static snapshot of the failed attempt in the history.
    Task copy = task.copy();
    copy.disposition = Task.Disposition.RETRIED;
    completedTasks.add(copy);
  }

  public void taskGroupCompleted(SchedulerStateActions taskGroup) {
    checkStatus();
  }

  public int getMaxRetries() {
    return maxRetries;
  }

  public int getStopTimeoutMs() {
    return stopTimeoutMs;
  }

  @Override
  public synchronized void reserveHost(String hostName) {
    nodeInventory.reserve(hostName);
  }

  @Override
  public synchronized void releaseHost(String hostName) {
    nodeInventory.release(hostName);
  }

  public NodeInventory getNodeInventory() {
    return nodeInventory;
  }

  @Override
  public void setProperty(String key, Object value) {
    properties.put(key, value);
  }

  @Override
  public Object getProperty(String key) {
    return properties.get(key);
  }

  @Override
  public void registerLifecycleListener(TaskLifecycleListener listener) {
    lifecycleListeners.add(listener);
  }

  public void fireLifecycleChange(Event event, EventContext context) {
    for (TaskLifecycleListener listener : lifecycleListeners) {
      listener.stateChange(event, context);
    }
  }

  @Override
  public void setMaxRetries(int value) {
    maxRetries = value;
  }

  @Override
  public int getTargetCount() {
    int count = 0;
    for (SchedulerStateActions group : prioritizedGroups) {
      count += group.getScheduler().getTarget();
    }
    return count;
  }

  public State getState() {
    return state;
  }

  @Override
  public synchronized void visit(ControllerVisitor visitor) {
    visitor.visit(this);
  }

  public List<SchedulerStateActions> getPools() {
    return prioritizedGroups;
  }

  @Override
  public synchronized void visitTasks(TaskVisitor visitor) {
    for (SchedulerStateActions pool : prioritizedGroups) {
      pool.visitTaskModels(visitor);
    }
  }

  public List<Task> getHistory() {
    return completedTasks;
  }

  @Override
  public boolean isTaskLive(int id) {
    for (SchedulerStateActions group : prioritizedGroups) {
      Task task = group.getTask(id);
      if (task != null) {
        return task.isLive();
      }
    }
    return false;
  }

  @Override
  public synchronized boolean cancelTask(int id) {
    for (SchedulerStateActions group : prioritizedGroups) {
      Task task = group.getTask(id);
      if (task != null) {
        group.cancel(task);
        group.getScheduler().change(-1);
        return true;
      }
    }
    LOG.warn("Requested to cancel task, but no task found: " + id);
    return false;
  }

  @Override
  public synchronized void completionAck(Task task, String propertyKey) {
    EventContext context = new EventContext(this);
    context.setTask(task);
    context.getState().completionAck(context);
    if (propertyKey != null) {
      task.properties.remove(propertyKey);
    }
  }

  @Override
  public synchronized void startAck(Task task, String propertyKey,
      Object value) {
    if (propertyKey != null && value != null) {
      task.properties.put(propertyKey, value);
    }
    EventContext context = new EventContext(this);
    context.setTask(task);
    context.getState().startAck(context);
  }

  @Override
  public boolean supportsDiskResource() {
    return getYarn().supportsDiskResource();
  }

  @Override
  public void registryDown() {
    shutDown();
  }
}