io.druid.indexing.overlord.RemoteTaskRunner.java Source code

Introduction

Here is the source code for io.druid.indexing.overlord.RemoteTaskRunner.java.

Source

/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexing.overlord;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Stopwatch;
import com.google.common.base.Supplier;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.ByteSource;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;
import com.metamx.common.ISE;
import com.metamx.common.lifecycle.LifecycleStart;
import com.metamx.common.lifecycle.LifecycleStop;
import com.metamx.emitter.EmittingLogger;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.Request;
import com.metamx.http.client.response.InputStreamResponseHandler;
import com.metamx.http.client.response.StatusResponseHandler;
import com.metamx.http.client.response.StatusResponseHolder;
import io.druid.curator.cache.PathChildrenCacheFactory;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.task.Task;
import io.druid.indexing.overlord.config.RemoteTaskRunnerConfig;
import io.druid.indexing.overlord.setup.WorkerBehaviorConfig;
import io.druid.indexing.overlord.setup.WorkerSelectStrategy;
import io.druid.indexing.worker.TaskAnnouncement;
import io.druid.indexing.worker.Worker;
import io.druid.server.initialization.IndexerZkConfig;
import io.druid.tasklogs.TaskLogStreamer;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.recipes.cache.PathChildrenCache;
import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent;
import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener;
import org.apache.curator.utils.ZKPaths;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.jboss.netty.handler.codec.http.HttpMethod;
import org.jboss.netty.handler.codec.http.HttpResponseStatus;
import org.joda.time.DateTime;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;

/**
 * The RemoteTaskRunner's primary responsibility is to assign tasks to worker nodes.
 * The RemoteTaskRunner uses Zookeeper to keep track of which workers are running which tasks. Tasks are assigned by
 * creating ephemeral nodes in ZK that workers must remove. Workers announce the statuses of the tasks they are running.
 * Once a task completes, it is up to the RTR to remove the task status and run any necessary cleanup.
 * The RemoteTaskRunner is event driven and updates state according to ephemeral node changes in ZK.
 * <p/>
 * The RemoteTaskRunner will assign tasks to a node until the node hits capacity. At that point, task assignment will
 * fail. The RemoteTaskRunner depends on another component to create additional worker resources.
 * For example, {@link io.druid.indexing.overlord.autoscaling.ResourceManagementScheduler} can take care of these duties.
 * <p/>
 * If a worker node becomes inexplicably disconnected from ZK, the RemoteTaskRunner will fail any tasks associated with
 * the worker after waiting RemoteTaskRunnerConfig.taskCleanupTimeout for the worker to reappear.
 * <p/>
 * The RemoteTaskRunner uses ZK for job management and assignment and HTTP for IPC messages.
 */
public class RemoteTaskRunner implements TaskRunner, TaskLogStreamer {
    private static final EmittingLogger log = new EmittingLogger(RemoteTaskRunner.class);
    private static final StatusResponseHandler RESPONSE_HANDLER = new StatusResponseHandler(Charsets.UTF_8);
    private static final Joiner JOINER = Joiner.on("/");

    private final ObjectMapper jsonMapper;
    private final RemoteTaskRunnerConfig config;
    private final IndexerZkConfig indexerZkConfig;
    private final CuratorFramework cf;
    private final PathChildrenCacheFactory pathChildrenCacheFactory;
    private final PathChildrenCache workerPathCache;
    private final HttpClient httpClient;
    private final Supplier<WorkerBehaviorConfig> workerConfigRef;

    // all workers that exist in ZK
    private final ConcurrentMap<String, ZkWorker> zkWorkers = new ConcurrentHashMap<>();
    // payloads of pending tasks, which we remember just long enough to assign to workers
    private final ConcurrentMap<String, Task> pendingTaskPayloads = new ConcurrentHashMap<>();
    // tasks that have not yet been assigned to a worker
    private final RemoteTaskRunnerWorkQueue pendingTasks = new RemoteTaskRunnerWorkQueue();
    // all tasks that have been assigned to a worker
    private final RemoteTaskRunnerWorkQueue runningTasks = new RemoteTaskRunnerWorkQueue();
    // tasks that are complete but not cleaned up yet
    private final RemoteTaskRunnerWorkQueue completeTasks = new RemoteTaskRunnerWorkQueue();

    private final ExecutorService runPendingTasksExec = Executors.newSingleThreadExecutor();

    // Workers that have been marked as lazy. These workers are not running any tasks and can be terminated safely by the scaling policy.
    private final ConcurrentMap<String, ZkWorker> lazyWorkers = new ConcurrentHashMap<>();

    private final Object statusLock = new Object();

    private volatile boolean started = false;

    private final ScheduledExecutorService cleanupExec;

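    // Cleanup jobs scheduled for workers that disappeared from ZK, keyed by worker host.
    // A pending cleanup is cancelled if the worker reappears before it fires (see addWorker).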
    private final ConcurrentMap<String, ScheduledFuture> removedWorkerCleanups = new ConcurrentHashMap<>();

    public RemoteTaskRunner(ObjectMapper jsonMapper, RemoteTaskRunnerConfig config, IndexerZkConfig indexerZkConfig,
            CuratorFramework cf, PathChildrenCacheFactory pathChildrenCacheFactory, HttpClient httpClient,
            Supplier<WorkerBehaviorConfig> workerConfigRef, ScheduledExecutorService cleanupExec) {
        this.jsonMapper = jsonMapper;
        this.config = config;
        this.indexerZkConfig = indexerZkConfig;
        this.cf = cf;
        this.pathChildrenCacheFactory = pathChildrenCacheFactory;
        this.workerPathCache = pathChildrenCacheFactory.make(cf, indexerZkConfig.getAnnouncementsPath());
        this.httpClient = httpClient;
        this.workerConfigRef = workerConfigRef;
        this.cleanupExec = cleanupExec;
    }

    @LifecycleStart
    public void start() {
        try {
            if (started) {
                return;
            }

            final MutableInt waitingFor = new MutableInt(1);
            final Object waitingForMonitor = new Object();
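            // waitingFor starts at 1 and is decremented by the cache's INITIALIZED event. Each CHILD_ADDED
            // increments it until the corresponding addWorker() callback fires, so start() blocks below until
            // the initial set of workers has been fully registered.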

            // Add listener for creation/deletion of workers
            workerPathCache.getListenable().addListener(new PathChildrenCacheListener() {
                @Override
                public void childEvent(CuratorFramework client, final PathChildrenCacheEvent event)
                        throws Exception {
                    final Worker worker;
                    switch (event.getType()) {
                    case CHILD_ADDED:
                        worker = jsonMapper.readValue(event.getData().getData(), Worker.class);
                        synchronized (waitingForMonitor) {
                            waitingFor.increment();
                        }
                        Futures.addCallback(addWorker(worker), new FutureCallback<ZkWorker>() {
                            @Override
                            public void onSuccess(ZkWorker zkWorker) {
                                synchronized (waitingForMonitor) {
                                    waitingFor.decrement();
                                    waitingForMonitor.notifyAll();
                                }
                            }

                            @Override
                            public void onFailure(Throwable throwable) {
                                synchronized (waitingForMonitor) {
                                    waitingFor.decrement();
                                    waitingForMonitor.notifyAll();
                                }
                            }
                        });
                        break;
                    case CHILD_UPDATED:
                        worker = jsonMapper.readValue(event.getData().getData(), Worker.class);
                        updateWorker(worker);
                        break;

                    case CHILD_REMOVED:
                        worker = jsonMapper.readValue(event.getData().getData(), Worker.class);
                        removeWorker(worker);
                        break;
                    case INITIALIZED:
                        synchronized (waitingForMonitor) {
                            waitingFor.decrement();
                            waitingForMonitor.notifyAll();
                        }
                        break;
                    default:
                        break;
                    }
                }
            });
            workerPathCache.start(PathChildrenCache.StartMode.POST_INITIALIZED_EVENT);
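            // POST_INITIALIZED_EVENT delivers the existing children as CHILD_ADDED events followed by a single
            // INITIALIZED event; wait here until that event and all pending addWorker() callbacks have completed.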
            synchronized (waitingForMonitor) {
                while (waitingFor.intValue() > 0) {
                    waitingForMonitor.wait();
                }
            }
            // Schedule cleanup of task statuses for workers that might have disconnected while the overlord was not running
            List<String> workers;
            try {
                workers = cf.getChildren().forPath(indexerZkConfig.getStatusPath());
            } catch (KeeperException.NoNodeException e) {
                // statusPath doesn't exist yet; can occur if no middleManagers have started.
                workers = ImmutableList.of();
            }
            for (String worker : workers) {
                if (!zkWorkers.containsKey(worker) && cf.checkExists()
                        .forPath(JOINER.join(indexerZkConfig.getAnnouncementsPath(), worker)) == null) {
                    scheduleTasksCleanupForWorker(worker,
                            cf.getChildren().forPath(JOINER.join(indexerZkConfig.getStatusPath(), worker)));
                }
            }

            started = true;
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

    @LifecycleStop
    public void stop() {
        try {
            if (!started) {
                return;
            }
            started = false;
            for (ZkWorker zkWorker : zkWorkers.values()) {
                zkWorker.close();
            }
            workerPathCache.close();
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

    @Override
    public Collection<ZkWorker> getWorkers() {
        return ImmutableList.copyOf(zkWorkers.values());
    }

    @Override
    public Collection<RemoteTaskRunnerWorkItem> getRunningTasks() {
        return ImmutableList.copyOf(runningTasks.values());
    }

    @Override
    public Collection<RemoteTaskRunnerWorkItem> getPendingTasks() {
        return ImmutableList.copyOf(pendingTasks.values());
    }

    @Override
    public Collection<RemoteTaskRunnerWorkItem> getKnownTasks() {
        // Racey, since there is a period of time during assignment when a task is neither pending nor running
        return ImmutableList
                .copyOf(Iterables.concat(pendingTasks.values(), runningTasks.values(), completeTasks.values()));
    }

    public ZkWorker findWorkerRunningTask(String taskId) {
        for (ZkWorker zkWorker : zkWorkers.values()) {
            if (zkWorker.isRunningTask(taskId)) {
                return zkWorker;
            }
        }
        return null;
    }

    public boolean isWorkerRunningTask(Worker worker, String taskId) {
        ZkWorker zkWorker = zkWorkers.get(worker.getHost());
        return (zkWorker != null && zkWorker.isRunningTask(taskId));
    }

    /**
     * A task will only be scheduled if the RemoteTaskRunner has no current knowledge of it; if the task is already
     * pending, running, or complete, the existing result future is returned instead.
     *
     * @param task task to run
     */
    @Override
    public ListenableFuture<TaskStatus> run(final Task task) {
        final RemoteTaskRunnerWorkItem completeTask, runningTask, pendingTask;
        if ((pendingTask = pendingTasks.get(task.getId())) != null) {
            log.info("Assigned a task[%s] that is already pending, not doing anything", task.getId());
            return pendingTask.getResult();
        } else if ((runningTask = runningTasks.get(task.getId())) != null) {
            ZkWorker zkWorker = findWorkerRunningTask(task.getId());
            if (zkWorker == null) {
                log.warn("Told to run task[%s], but no worker has started running it yet.", task.getId());
            } else {
                log.info("Task[%s] already running on %s.", task.getId(), zkWorker.getWorker().getHost());
                TaskAnnouncement announcement = zkWorker.getRunningTasks().get(task.getId());
                if (announcement.getTaskStatus().isComplete()) {
                    taskComplete(runningTask, zkWorker, announcement.getTaskStatus());
                }
            }
            return runningTask.getResult();
        } else if ((completeTask = completeTasks.get(task.getId())) != null) {
            return completeTask.getResult();
        } else {
            return addPendingTask(task).getResult();
        }
    }

    /**
     * Finds the worker running the task and forwards the shutdown signal to the worker.
     *
     * @param taskId - id of the task to shut down
     */
    @Override
    public void shutdown(final String taskId) {
        if (!started) {
            log.info("This TaskRunner is stopped. Ignoring shutdown command for task: %s", taskId);
        } else if (pendingTasks.remove(taskId) != null) {
            pendingTaskPayloads.remove(taskId);
            log.info("Removed task from pending queue: %s", taskId);
        } else if (completeTasks.containsKey(taskId)) {
            cleanup(taskId);
        } else {
            final ZkWorker zkWorker = findWorkerRunningTask(taskId);

            if (zkWorker == null) {
                log.info("Can't shutdown! No worker running task %s", taskId);
                return;
            }

            try {
                final URL url = makeWorkerURL(zkWorker.getWorker(), String.format("/task/%s/shutdown", taskId));
                final StatusResponseHolder response = httpClient
                        .go(new Request(HttpMethod.POST, url), RESPONSE_HANDLER).get();

                log.info("Sent shutdown message to worker: %s, status %s, response: %s",
                        zkWorker.getWorker().getHost(), response.getStatus(), response.getContent());

                if (!response.getStatus().equals(HttpResponseStatus.ACCEPTED)) {
                    log.error("Shutdown failed for %s! Are you sure the task was running?", taskId);
                }
            } catch (Exception e) {
                throw Throwables.propagate(e);
            }
        }
    }

    @Override
    public Optional<ByteSource> streamTaskLog(final String taskId, final long offset) {
        final ZkWorker zkWorker = findWorkerRunningTask(taskId);

        if (zkWorker == null) {
            // Worker is not running this task; it might be available in deep storage
            return Optional.absent();
        } else {
            // Worker is still running this task
            final URL url = makeWorkerURL(zkWorker.getWorker(),
                    String.format("/task/%s/log?offset=%d", taskId, offset));
            return Optional.<ByteSource>of(new ByteSource() {
                @Override
                public InputStream openStream() throws IOException {
                    try {
                        return httpClient.go(new Request(HttpMethod.GET, url), new InputStreamResponseHandler())
                                .get();
                    } catch (InterruptedException e) {
                        throw Throwables.propagate(e);
                    } catch (ExecutionException e) {
                        // Unwrap if possible
                        Throwables.propagateIfPossible(e.getCause(), IOException.class);
                        throw Throwables.propagate(e);
                    }
                }
            });
        }
    }

    private URL makeWorkerURL(Worker worker, String path) {
        Preconditions.checkArgument(path.startsWith("/"), "path must start with '/': %s", path);

        try {
            return new URL(String.format("http://%s/druid/worker/v1%s", worker.getHost(), path));
        } catch (MalformedURLException e) {
            throw Throwables.propagate(e);
        }
    }

    /**
     * Adds a task to the pending queue
     */
    private RemoteTaskRunnerWorkItem addPendingTask(final Task task) {
        log.info("Added pending task %s", task.getId());
        final RemoteTaskRunnerWorkItem taskRunnerWorkItem = new RemoteTaskRunnerWorkItem(task.getId(), null);
        pendingTaskPayloads.put(task.getId(), task);
        pendingTasks.put(task.getId(), taskRunnerWorkItem);
        runPendingTasks();
        return taskRunnerWorkItem;
    }

    /**
     * This method uses a single threaded executor to extract all pending tasks and attempt to run them. Any tasks that
     * are successfully assigned to a worker will be moved from pendingTasks to runningTasks. This method is thread-safe.
     * This method should be run each time there is new worker capacity or new tasks are added.
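     * In this class it is triggered when a task is added to the pending queue, when a running task completes,
     * and when a newly discovered worker finishes initializing.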
     */
    private void runPendingTasks() {
        runPendingTasksExec.submit(new Callable<Void>() {
            @Override
            public Void call() throws Exception {
                try {
                    // make a copy of the pending tasks because tryAssignTask may delete tasks from pending and move them
                    // into running status
                    List<RemoteTaskRunnerWorkItem> copy = Lists.newArrayList(pendingTasks.values());
                    for (RemoteTaskRunnerWorkItem taskRunnerWorkItem : copy) {
                        String taskId = taskRunnerWorkItem.getTaskId();
                        try {
                            if (tryAssignTask(pendingTaskPayloads.get(taskId), taskRunnerWorkItem)) {
                                pendingTaskPayloads.remove(taskId);
                            }
                        } catch (Exception e) {
                            log.makeAlert(e, "Exception while trying to assign task")
                                    .addData("taskId", taskRunnerWorkItem.getTaskId()).emit();
                            RemoteTaskRunnerWorkItem workItem = pendingTasks.remove(taskId);
                            taskComplete(workItem, null, TaskStatus.failure(taskId));
                        }
                    }
                } catch (Exception e) {
                    log.makeAlert(e, "Exception in running pending tasks").emit();
                }

                return null;
            }
        });
    }

    /**
     * Removes a task from the complete queue and clears out the ZK status path of the task.
     *
     * @param taskId - the task to cleanup
     */
    private void cleanup(final String taskId) {
        if (!started) {
            return;
        }
        final RemoteTaskRunnerWorkItem removed = completeTasks.remove(taskId);
        if (removed == null || removed.getWorker() == null) {
            log.makeAlert("WTF?! Asked to cleanup nonexistent task").addData("taskId", taskId).emit();
        } else {
            final String workerId = removed.getWorker().getHost();
            log.info("Cleaning up task[%s] on worker[%s]", taskId, workerId);
            final String statusPath = JOINER.join(indexerZkConfig.getStatusPath(), workerId, taskId);
            try {
                cf.delete().guaranteed().forPath(statusPath);
            } catch (KeeperException.NoNodeException e) {
                log.info("Tried to delete status path[%s] that didn't exist! Must've gone away already?",
                        statusPath);
            } catch (Exception e) {
                throw Throwables.propagate(e);
            }
        }
    }

    /**
     * Ensures no workers are already running a task before assigning the task to a worker.
     * It is possible that a worker is running a task that the RTR has no knowledge of. This occurs when the RTR
     * needs to bootstrap after a restart.
     *
     * @param task               - the task to assign
     * @param taskRunnerWorkItem - the work item tracking the task's assignment
     *
     * @return true iff the task is now assigned
     */
    private boolean tryAssignTask(final Task task, final RemoteTaskRunnerWorkItem taskRunnerWorkItem)
            throws Exception {
        Preconditions.checkNotNull(task, "task");
        Preconditions.checkNotNull(taskRunnerWorkItem, "taskRunnerWorkItem");
        Preconditions.checkArgument(task.getId().equals(taskRunnerWorkItem.getTaskId()), "task id != workItem id");

        if (runningTasks.containsKey(task.getId()) || findWorkerRunningTask(task.getId()) != null) {
            log.info("Task[%s] already running.", task.getId());
            return true;
        } else {
            // Nothing running this task, announce it in ZK for a worker to run it
            WorkerBehaviorConfig workerConfig = workerConfigRef.get();
            WorkerSelectStrategy strategy;
            if (workerConfig == null || workerConfig.getSelectStrategy() == null) {
                log.warn("No worker selection strategy set. Using default.");
                strategy = WorkerBehaviorConfig.DEFAULT_STRATEGY;
            } else {
                strategy = workerConfig.getSelectStrategy();
            }
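            // Hand the strategy an immutable snapshot of all non-lazy workers; lazy workers are excluded so the
            // scaling policy can terminate them without racing against new assignments.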
            final Optional<ImmutableZkWorker> immutableZkWorker = strategy.findWorkerForTask(config,
                    ImmutableMap.copyOf(Maps.transformEntries(
                            Maps.filterEntries(zkWorkers, new Predicate<Map.Entry<String, ZkWorker>>() {
                                @Override
                                public boolean apply(Map.Entry<String, ZkWorker> input) {
                                    return !lazyWorkers.containsKey(input.getKey());
                                }
                            }), new Maps.EntryTransformer<String, ZkWorker, ImmutableZkWorker>() {
                                @Override
                                public ImmutableZkWorker transformEntry(String key, ZkWorker value) {
                                    return value.toImmutable();
                                }
                            })),
                    task);
            if (immutableZkWorker.isPresent()) {
                final ZkWorker zkWorker = zkWorkers.get(immutableZkWorker.get().getWorker().getHost());
                return announceTask(task, zkWorker, taskRunnerWorkItem);
            } else {
                log.debug("Worker nodes %s do not have capacity to run any more tasks!", zkWorkers.values());
                return false;
            }
        }
    }

    /**
     * Creates a ZK entry under a specific path associated with a worker. The worker is responsible for
     * removing the task ZK entry and creating a task status ZK entry.
     *
     * @param task               The task to be assigned
     * @param theZkWorker        The worker the task is assigned to
     * @param taskRunnerWorkItem The work item tracking the task's assignment
     *
     * @return boolean indicating whether the task was successfully assigned or not
     */
    private boolean announceTask(final Task task, final ZkWorker theZkWorker,
            final RemoteTaskRunnerWorkItem taskRunnerWorkItem) throws Exception {
        Preconditions.checkArgument(task.getId().equals(taskRunnerWorkItem.getTaskId()), "task id != workItem id");
        final String worker = theZkWorker.getWorker().getHost();
        synchronized (statusLock) {
            if (!zkWorkers.containsKey(worker) || lazyWorkers.containsKey(worker)) {
                // the worker might have been killed or marked as lazy.
                log.info("Not assigning task to already removed worker[%s]", worker);
                return false;
            }
            log.info("Coordinator asking Worker[%s] to add task[%s]", worker, task.getId());

            byte[] rawBytes = jsonMapper.writeValueAsBytes(task);
            if (rawBytes.length > config.getMaxZnodeBytes()) {
                throw new ISE("Length of raw bytes for task too large[%,d > %,d]", rawBytes.length,
                        config.getMaxZnodeBytes());
            }

            String taskPath = JOINER.join(indexerZkConfig.getTasksPath(), worker, task.getId());

            if (cf.checkExists().forPath(taskPath) == null) {
                cf.create().withMode(CreateMode.EPHEMERAL).forPath(taskPath, rawBytes);
            }

            RemoteTaskRunnerWorkItem workItem = pendingTasks.remove(task.getId());
            if (workItem == null) {
                log.makeAlert("WTF?! Got a null work item from pending tasks?! How can this be?!")
                        .addData("taskId", task.getId()).emit();
                return false;
            }

            RemoteTaskRunnerWorkItem newWorkItem = workItem.withWorker(theZkWorker.getWorker());
            runningTasks.put(task.getId(), newWorkItem);
            log.info("Task %s switched from pending to running (on [%s])", task.getId(),
                    newWorkItem.getWorker().getHost());

            // Syncing state with Zookeeper - don't assign new tasks until the task we just assigned is actually running
            // on a worker - this avoids overflowing a worker with tasks
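            // statusLock is notified by the worker status listener (see addWorker) whenever a status znode is
            // created or updated, which is what eventually breaks this wait.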
            Stopwatch timeoutStopwatch = Stopwatch.createStarted();
            while (!isWorkerRunningTask(theZkWorker.getWorker(), task.getId())) {
                final long waitMs = config.getTaskAssignmentTimeout().toStandardDuration().getMillis();
                statusLock.wait(waitMs);
                long elapsed = timeoutStopwatch.elapsed(TimeUnit.MILLISECONDS);
                if (elapsed >= waitMs) {
                    log.error("Something went wrong! [%s] never ran task [%s]! Timeout: (%s >= %s)!", worker,
                            task.getId(), elapsed, config.getTaskAssignmentTimeout());
                    taskComplete(taskRunnerWorkItem, theZkWorker, TaskStatus.failure(task.getId()));
                    break;
                }
            }
            return true;
        }
    }

    /**
     * When a new worker appears, listeners are registered for status changes associated with tasks assigned to
     * the worker. Status changes indicate the creation or completion of a task.
     * The RemoteTaskRunner updates state according to these changes.
     *
     * @param worker contains metadata for a worker that has appeared in ZK
     *
     * @return future that will contain a fully initialized worker
     */
    private ListenableFuture<ZkWorker> addWorker(final Worker worker) {
        log.info("Worker[%s] reportin' for duty!", worker.getHost());

        try {
            ScheduledFuture previousCleanup = removedWorkerCleanups.remove(worker.getHost());
            if (previousCleanup != null) {
                log.info("Cancelling Worker[%s] scheduled task cleanup", worker.getHost());
                previousCleanup.cancel(false);
            }

            final String workerStatusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker.getHost());
            final PathChildrenCache statusCache = pathChildrenCacheFactory.make(cf, workerStatusPath);
            final SettableFuture<ZkWorker> retVal = SettableFuture.create();
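            // retVal is completed by the INITIALIZED case below, once the worker's status cache finishes its
            // initial load and the ZkWorker has been registered in zkWorkers.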
            final ZkWorker zkWorker = new ZkWorker(worker, statusCache, jsonMapper);

            // Add status listener to the watcher for status changes
            zkWorker.addListener(new PathChildrenCacheListener() {
                @Override
                public void childEvent(CuratorFramework client, PathChildrenCacheEvent event) throws Exception {
                    final String taskId;
                    final RemoteTaskRunnerWorkItem taskRunnerWorkItem;
                    synchronized (statusLock) {
                        try {
                            switch (event.getType()) {
                            case CHILD_ADDED:
                            case CHILD_UPDATED:
                                taskId = ZKPaths.getNodeFromPath(event.getData().getPath());
                                final TaskStatus taskStatus = jsonMapper.readValue(event.getData().getData(),
                                        TaskStatus.class);

                                log.info("Worker[%s] wrote %s status for task: %s", zkWorker.getWorker().getHost(),
                                        taskStatus.getStatusCode(), taskId);

                                // Synchronizing state with ZK
                                statusLock.notifyAll();

                                final RemoteTaskRunnerWorkItem tmp;
                                if ((tmp = runningTasks.get(taskId)) != null) {
                                    taskRunnerWorkItem = tmp;
                                } else {
                                    final RemoteTaskRunnerWorkItem newTaskRunnerWorkItem = new RemoteTaskRunnerWorkItem(
                                            taskId, zkWorker.getWorker());
                                    final RemoteTaskRunnerWorkItem existingItem = runningTasks.putIfAbsent(taskId,
                                            newTaskRunnerWorkItem);
                                    if (existingItem == null) {
                                        log.warn(
                                                "Worker[%s] announced a status for a task I didn't know about, adding to runningTasks: %s",
                                                zkWorker.getWorker().getHost(), taskId);
                                        taskRunnerWorkItem = newTaskRunnerWorkItem;
                                    } else {
                                        taskRunnerWorkItem = existingItem;
                                    }
                                }

                                if (taskStatus.isComplete()) {
                                    taskComplete(taskRunnerWorkItem, zkWorker, taskStatus);
                                    runPendingTasks();
                                }
                                break;
                            case CHILD_REMOVED:
                                taskId = ZKPaths.getNodeFromPath(event.getData().getPath());
                                taskRunnerWorkItem = runningTasks.remove(taskId);
                                if (taskRunnerWorkItem != null) {
                                    log.info("Task[%s] just disappeared!", taskId);
                                    taskRunnerWorkItem
                                            .setResult(TaskStatus.failure(taskRunnerWorkItem.getTaskId()));
                                } else {
                                    log.info("Task[%s] went bye bye.", taskId);
                                }
                                break;
                            case INITIALIZED:
                                if (zkWorkers.putIfAbsent(worker.getHost(), zkWorker) == null) {
                                    retVal.set(zkWorker);
                                } else {
                                    final String message = String.format(
                                            "WTF?! Tried to add already-existing worker[%s]", worker.getHost());
                                    log.makeAlert(message).addData("workerHost", worker.getHost())
                                            .addData("workerIp", worker.getIp()).emit();
                                    retVal.setException(new IllegalStateException(message));
                                }
                                runPendingTasks();
                            }
                        } catch (Exception e) {
                            log.makeAlert(e, "Failed to handle new worker status")
                                    .addData("worker", zkWorker.getWorker().getHost())
                                    .addData("znode", event.getData().getPath()).emit();
                        }
                    }
                }
            });
            zkWorker.start();
            return retVal;
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

    /**
     * We allow workers to change their own capacities and versions. They cannot change their own hosts or IPs without
     * dropping themselves and re-announcing.
     */
    private void updateWorker(final Worker worker) {
        final ZkWorker zkWorker = zkWorkers.get(worker.getHost());
        if (zkWorker != null) {
            log.info("Worker[%s] updated its announcement from[%s] to[%s].", worker.getHost(), zkWorker.getWorker(),
                    worker);
            zkWorker.setWorker(worker);
        } else {
            log.warn("WTF, worker[%s] updated its announcement but we didn't have a ZkWorker for it. Ignoring.",
                    worker.getHost());
        }
    }

    /**
     * When an ephemeral worker node disappears from ZK, incomplete running tasks will be retried by
     * the logic in the status listener. We still have to make sure there are no tasks assigned
     * to the worker but not yet running.
     *
     * @param worker - the removed worker
     */
    private void removeWorker(final Worker worker) {
        log.info("Kaboom! Worker[%s] removed!", worker.getHost());

        final ZkWorker zkWorker = zkWorkers.get(worker.getHost());
        if (zkWorker != null) {
            try {
                scheduleTasksCleanupForWorker(worker.getHost(), getAssignedTasks(worker));
            } catch (Exception e) {
                throw Throwables.propagate(e);
            } finally {
                try {
                    zkWorker.close();
                } catch (Exception e) {
                    log.error(e, "Exception closing worker[%s]!", worker.getHost());
                }
                zkWorkers.remove(worker.getHost());
            }
        }
        lazyWorkers.remove(worker.getHost());
    }

    private void scheduleTasksCleanupForWorker(final String worker, final List<String> tasksToFail) {
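        // Delay the cleanup by taskCleanupTimeout so a briefly disconnected worker has a chance to reappear;
        // addWorker() cancels this scheduled job if the worker comes back.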
        removedWorkerCleanups.put(worker, cleanupExec.schedule(new Runnable() {
            @Override
            public void run() {
                log.info("Running scheduled cleanup for Worker[%s]", worker);
                try {
                    for (String assignedTask : tasksToFail) {
                        String taskPath = JOINER.join(indexerZkConfig.getTasksPath(), worker, assignedTask);
                        String statusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker, assignedTask);
                        if (cf.checkExists().forPath(taskPath) != null) {
                            cf.delete().guaranteed().forPath(taskPath);
                        }

                        if (cf.checkExists().forPath(statusPath) != null) {
                            cf.delete().guaranteed().forPath(statusPath);
                        }

                        log.info("Failing task[%s]", assignedTask);
                        RemoteTaskRunnerWorkItem taskRunnerWorkItem = runningTasks.remove(assignedTask);
                        if (taskRunnerWorkItem != null) {
                            taskRunnerWorkItem.setResult(TaskStatus.failure(taskRunnerWorkItem.getTaskId()));
                        } else {
                            log.warn("RemoteTaskRunner has no knowledge of task[%s]", assignedTask);
                        }
                    }

                    // worker is gone, remove worker task status announcements path.
                    String workerStatusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker);
                    if (cf.checkExists().forPath(workerStatusPath) != null) {
                        cf.delete().guaranteed().forPath(JOINER.join(indexerZkConfig.getStatusPath(), worker));
                    }
                } catch (Exception e) {
                    log.makeAlert(e, "Exception while cleaning up worker[%s]", worker).emit();
                    throw Throwables.propagate(e);
                } finally {
                    removedWorkerCleanups.remove(worker);
                }
            }
        }, config.getTaskCleanupTimeout().toStandardDuration().getMillis(), TimeUnit.MILLISECONDS));
    }

    private void taskComplete(RemoteTaskRunnerWorkItem taskRunnerWorkItem, ZkWorker zkWorker,
            TaskStatus taskStatus) {
        Preconditions.checkNotNull(taskRunnerWorkItem, "taskRunnerWorkItem");
        Preconditions.checkNotNull(taskStatus, "taskStatus");
        if (zkWorker != null) {
            log.info("Worker[%s] completed task[%s] with status[%s]", zkWorker.getWorker().getHost(),
                    taskStatus.getId(), taskStatus.getStatusCode());
            // Worker is done with this task
            zkWorker.setLastCompletedTaskTime(new DateTime());
        } else {
            log.info("Workerless task[%s] completed with status[%s]", taskStatus.getId(),
                    taskStatus.getStatusCode());
        }

        // Move from running -> complete
        completeTasks.put(taskStatus.getId(), taskRunnerWorkItem);
        runningTasks.remove(taskStatus.getId());

        // Notify interested parties
        taskRunnerWorkItem.setResult(taskStatus);
    }

    public List<ZkWorker> markWorkersLazy(Predicate<ZkWorker> isLazyWorker, int maxWorkers) {
        // status lock is used to prevent any tasks being assigned to the worker while we mark it lazy
        synchronized (statusLock) {
            Iterator<String> iterator = zkWorkers.keySet().iterator();
            while (iterator.hasNext()) {
                String worker = iterator.next();
                ZkWorker zkWorker = zkWorkers.get(worker);
                try {
                    if (getAssignedTasks(zkWorker.getWorker()).isEmpty() && isLazyWorker.apply(zkWorker)) {
                        log.info("Adding Worker[%s] to lazySet!", zkWorker.getWorker().getHost());
                        lazyWorkers.put(worker, zkWorker);
                        if (lazyWorkers.size() == maxWorkers) {
                            // only mark excess workers as lazy and allow their cleanup
                            break;
                        }
                    }
                } catch (Exception e) {
                    throw Throwables.propagate(e);
                }
            }
            return ImmutableList.copyOf(lazyWorkers.values());
        }
    }

    private List<String> getAssignedTasks(Worker worker) throws Exception {
        List<String> assignedTasks = Lists.newArrayList(
                cf.getChildren().forPath(JOINER.join(indexerZkConfig.getTasksPath(), worker.getHost())));

        for (Map.Entry<String, RemoteTaskRunnerWorkItem> entry : runningTasks.entrySet()) {
            if (entry.getValue() == null) {
                log.error("Huh? null work item for [%s]", entry.getKey());
            } else if (entry.getValue().getWorker() == null) {
                log.error("Huh? no worker for [%s]", entry.getKey());
            } else if (entry.getValue().getWorker().getHost().equalsIgnoreCase(worker.getHost())) {
                log.info("[%s]: Found [%s] running", worker.getHost(), entry.getKey());
                assignedTasks.add(entry.getKey());
            }
        }
        log.info("[%s]: Found %d tasks assigned", worker.getHost(), assignedTasks.size());
        return assignedTasks;
    }

    // Used for tests
    public List<ZkWorker> getLazyWorkers() {
        return ImmutableList.copyOf(lazyWorkers.values());
    }

    ConcurrentMap<String, ScheduledFuture> getRemovedWorkerCleanups() {
        return removedWorkerCleanups;
    }
}
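
For orientation, here is a minimal, hypothetical usage sketch; it is not part of the original file. It assumes a fully constructed RemoteTaskRunner (its dependencies, such as the ObjectMapper, RemoteTaskRunnerConfig, IndexerZkConfig and CuratorFramework, are normally wired together by Druid's Guice injector) and an existing Task instance. Only public methods declared in the class above are used, and the class and method names below (RemoteTaskRunnerSketch, submit) are invented for illustration.

import com.google.common.util.concurrent.ListenableFuture;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.task.Task;
import io.druid.indexing.overlord.RemoteTaskRunner;

public final class RemoteTaskRunnerSketch {
    // Hypothetical driver. In a real deployment the @LifecycleStart/@LifecycleStop annotations mean that
    // start() and stop() are invoked by Druid's Lifecycle rather than called by hand.
    public static ListenableFuture<TaskStatus> submit(RemoteTaskRunner runner, Task task) {
        runner.start();                                          // register the worker listener and wait for ZK initialization
        ListenableFuture<TaskStatus> status = runner.run(task);  // announce the task in ZK for a worker to pick up
        // ... later, e.g. when the overlord shuts down:
        // runner.stop();
        return status;
    }
}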