org.apache.druid.indexing.common.task.batch.parallel.TaskMonitor.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.druid.indexing.common.task.batch.parallel.TaskMonitor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task.batch.parallel;

import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;
import org.apache.druid.client.indexing.IndexingServiceClient;
import org.apache.druid.client.indexing.TaskStatusResponse;
import org.apache.druid.indexer.TaskState;
import org.apache.druid.indexer.TaskStatusPlus;
import org.apache.druid.indexing.common.task.Task;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.logger.Logger;

import javax.annotation.Nullable;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

/**
 * Responsible for submitting tasks, monitoring task statuses, resubmitting failed tasks, and returning the final task
 * status.
 */
public class TaskMonitor<T extends Task> {
    private static final Logger log = new Logger(TaskMonitor.class);

    /** Single-threaded scheduler that runs the periodic status check registered in {@link #start}. */
    private final ScheduledExecutorService taskStatusChecker = Execs.scheduledSingleThreaded(("task-monitor-%d"));

    /**
     * A map of subTaskSpecId to {@link MonitorEntry}. This map stores the state of running {@link SubTaskSpec}s. This is
     * read by the periodic status check executed by {@link #taskStatusChecker} and updated in {@link #submit}
     * and {@link #retry}. This can also be read by calling {@link #getRunningTaskMonitorEntry},
     * {@link #getRunningTaskIds}, and {@link #getRunningSubTaskSpecs}.
     */
    private final ConcurrentMap<String, MonitorEntry> runningTasks = new ConcurrentHashMap<>();

    /**
     * A map of subTaskSpecId to {@link TaskHistory}. This map stores the history of complete {@link SubTaskSpec}s
     * whether their final state is succeeded or failed. This is updated in {@link MonitorEntry#setLastStatus} which is
     * called by the periodic status check executed by {@link #taskStatusChecker} and can be
     * read from outside of this class.
     */
    private final ConcurrentMap<String, TaskHistory<T>> taskHistories = new ConcurrentHashMap<>();

    // lock for updating numRunningTasks, numSucceededTasks, and numFailedTasks
    private final Object taskCountLock = new Object();

    // lock for updating running state
    private final Object startStopLock = new Object();

    // overlord client
    private final IndexingServiceClient indexingServiceClient;
    // a spec is resubmitted only while MonitorEntry#numTries() is smaller than this value
    private final int maxRetry;
    // number of succeeded tasks at which isSucceeded() reports true
    private final int expectedNumSucceededTasks;

    // the three counters below are guarded by taskCountLock
    private int numRunningTasks;
    private int numSucceededTasks;
    private int numFailedTasks;

    // guarded by startStopLock
    private boolean running = false;

    /**
     * Creates a monitor which is not running yet; call {@link #start} before {@link #submit}.
     *
     * @param indexingServiceClient     client used to run, poll, and kill sub tasks; must not be null
     * @param maxRetry                  upper bound of tries per spec (see {@link MonitorEntry#numTries()})
     * @param expectedNumSucceededTasks number of succeeded tasks required for {@link #isSucceeded()} to be true
     */
    TaskMonitor(IndexingServiceClient indexingServiceClient, int maxRetry, int expectedNumSucceededTasks) {
        this.indexingServiceClient = Preconditions.checkNotNull(indexingServiceClient, "indexingServiceClient");
        this.maxRetry = maxRetry;
        this.expectedNumSucceededTasks = expectedNumSucceededTasks;

        log.info("TaskMonitor is initialized with expectedNumSucceededTasks[%d]", expectedNumSucceededTasks);
    }

    /**
     * Marks this monitor as running and schedules the periodic status check of all running sub tasks.
     *
     * @param taskStatusCheckingPeriod initial delay and period of the status check, in milliseconds
     */
    public void start(long taskStatusCheckingPeriod) {
        synchronized (startStopLock) {
            running = true;
            log.info("Starting taskMonitor");
            // NOTE: This polling can be improved to event-driven pushing by registering TaskRunnerListener to TaskRunner.
            // That listener should be able to send the events reported to TaskRunner to this TaskMonitor.
            taskStatusChecker.scheduleAtFixedRate(() -> {
                try {
                    checkRunningTasks();
                } catch (Exception e) {
                    // Only log here. An exception escaping from a task scheduled with scheduleAtFixedRate suppresses
                    // every subsequent execution, which would leave the futures returned by submit() incomplete
                    // forever after e.g. a single transient failure of the overlord client.
                    log.error(e, "Error while monitoring");
                } catch (Error e) {
                    // Errors are treated as fatal: log and propagate them.
                    log.error(e, "Error while monitoring");
                    throw e;
                }
            }, taskStatusCheckingPeriod, taskStatusCheckingPeriod, TimeUnit.MILLISECONDS);
        }
    }

    /**
     * Polls the status of every running sub task once. Succeeded tasks and tasks which failed without remaining
     * retries are moved to {@link #taskHistories}; failed tasks with remaining retries are resubmitted via
     * {@link #retry}. This method is supposed to be called only from {@link #taskStatusChecker}.
     */
    private void checkRunningTasks() {
        final Iterator<Entry<String, MonitorEntry>> iterator = runningTasks.entrySet().iterator();
        while (iterator.hasNext()) {
            final Entry<String, MonitorEntry> entry = iterator.next();
            final String specId = entry.getKey();
            final MonitorEntry monitorEntry = entry.getValue();
            final String taskId = monitorEntry.runningTask.getId();
            final TaskStatusResponse taskStatusResponse = indexingServiceClient.getTaskStatus(taskId);
            final TaskStatusPlus taskStatus = taskStatusResponse.getStatus();
            if (taskStatus == null) {
                // No status is available for this task yet; check it again in the next round.
                continue;
            }

            switch (Preconditions.checkNotNull(taskStatus.getStatusCode(), "taskState")) {
            case SUCCESS:
                incrementNumSucceededTasks();

                // Remove the current entry after updating taskHistories to make sure that task history
                // exists either runningTasks or taskHistories.
                monitorEntry.setLastStatus(taskStatus);
                iterator.remove();
                break;
            case FAILED:
                incrementNumFailedTasks();

                log.warn("task[%s] failed!", taskId);
                if (monitorEntry.numTries() < maxRetry) {
                    log.info("We still have chances[%d/%d] to complete for spec[%s].",
                            monitorEntry.numTries(), maxRetry, monitorEntry.spec.getId());
                    // NOTE(review): if retry() throws before replacing the entry, the same failed task is seen
                    // (and counted as failed) again in the next round - confirm whether this needs handling.
                    retry(specId, monitorEntry, taskStatus);
                } else {
                    log.error("spec[%s] failed after [%d] tries", monitorEntry.spec.getId(),
                            monitorEntry.numTries());
                    // Remove the current entry after updating taskHistories to make sure that task history
                    // exists either runningTasks or taskHistories.
                    monitorEntry.setLastStatus(taskStatus);
                    iterator.remove();
                }
                break;
            case RUNNING:
                monitorEntry.updateStatus(taskStatus);
                break;
            default:
                throw new ISE("Unknown taskStatus[%s] for task[%s]", taskStatus.getStatusCode(), taskId);
            }
        }
    }

    /**
     * Marks this monitor as stopped and shuts down the status checker. Running sub tasks are not killed here;
     * see {@link #killAll()}.
     */
    public void stop() {
        synchronized (startStopLock) {
            running = false;
            taskStatusChecker.shutdownNow();
            log.info("Stopped taskMonitor");
        }
    }

    /**
     * Creates the first sub task of the given spec, submits it through the overlord client, and starts tracking it.
     *
     * @param spec spec to create the sub task from
     * @return a future which is set by {@link MonitorEntry#setLastStatus} once the spec reaches its final state, or an
     *         immediately failed future if this monitor is not running
     */
    public ListenableFuture<SubTaskCompleteEvent<T>> submit(SubTaskSpec<T> spec) {
        synchronized (startStopLock) {
            if (!running) {
                return Futures.immediateFailedFuture(new ISE("TaskMonitor is not running"));
            }
            final T task = spec.newSubTask(0);
            log.info("Submitting a new task[%s] for spec[%s]", task.getId(), spec.getId());
            indexingServiceClient.runTask(task);
            incrementNumRunningTasks();

            final SettableFuture<SubTaskCompleteEvent<T>> taskFuture = SettableFuture.create();
            runningTasks.put(spec.getId(), new MonitorEntry(spec, task,
                    indexingServiceClient.getTaskStatus(task.getId()).getStatus(), taskFuture));

            return taskFuture;
        }
    }

    /**
     * Submit a retry task for a failed spec. This method should be called inside of the periodic status check
     * executed by {@link #taskStatusChecker}. Nothing is submitted if this monitor has been stopped.
     *
     * @param subTaskSpecId        key of the entry in {@link #runningTasks} to replace
     * @param monitorEntry         entry holding the failed task
     * @param lastFailedTaskStatus status of the failed task, appended to the task history of the spec
     */
    private void retry(String subTaskSpecId, MonitorEntry monitorEntry, TaskStatusPlus lastFailedTaskStatus) {
        synchronized (startStopLock) {
            if (running) {
                final SubTaskSpec<T> spec = monitorEntry.spec;
                final T task = spec.newSubTask(monitorEntry.taskHistory.size() + 1);
                log.info("Submitting a new task[%s] for retrying spec[%s]", task.getId(), spec.getId());
                indexingServiceClient.runTask(task);
                incrementNumRunningTasks();

                runningTasks.put(subTaskSpecId, monitorEntry.withNewRunningTask(task,
                        indexingServiceClient.getTaskStatus(task.getId()).getStatus(), lastFailedTaskStatus));
            }
        }
    }

    /**
     * Requests the overlord to kill every running sub task and forgets all of them.
     * This method should be called after {@link #stop()} to make sure no additional tasks are submitted.
     */
    void killAll() {
        runningTasks.values().forEach(entry -> {
            final String taskId = entry.runningTask.getId();
            log.info("Request to kill subtask[%s]", taskId);
            indexingServiceClient.killTask(taskId);
        });
        runningTasks.clear();
    }

    /** Records that one more sub task has been submitted. */
    void incrementNumRunningTasks() {
        synchronized (taskCountLock) {
            numRunningTasks++;
        }
    }

    /** Moves one sub task from the running count to the succeeded count. */
    void incrementNumSucceededTasks() {
        synchronized (taskCountLock) {
            numRunningTasks--;
            numSucceededTasks++;
            log.info("[%d/%d] tasks succeeded", numSucceededTasks, expectedNumSucceededTasks);
        }
    }

    /** Moves one sub task from the running count to the failed count. */
    void incrementNumFailedTasks() {
        synchronized (taskCountLock) {
            numRunningTasks--;
            numFailedTasks++;
        }
    }

    /** @return true if exactly {@link #expectedNumSucceededTasks} sub tasks have succeeded so far */
    boolean isSucceeded() {
        synchronized (taskCountLock) {
            return numSucceededTasks == expectedNumSucceededTasks;
        }
    }

    /** @return the current value of the running task counter */
    int getNumRunningTasks() {
        synchronized (taskCountLock) {
            return numRunningTasks;
        }
    }

    /**
     * Returns a progress snapshot built from the task counters while holding {@link #taskCountLock}, so that all
     * values are consistent with each other.
     */
    SinglePhaseParallelIndexingProgress getProgress() {
        synchronized (taskCountLock) {
            // tasks which reached a final state, and all tasks counted so far
            final int numCompleteTasks = numSucceededTasks + numFailedTasks;
            final int numTotalTasks = numRunningTasks + numCompleteTasks;
            return new SinglePhaseParallelIndexingProgress(numRunningTasks, numSucceededTasks, numFailedTasks,
                    numCompleteTasks, numTotalTasks, expectedNumSucceededTasks);
        }
    }

    /** @return ids of the tasks currently tracked in {@link #runningTasks} */
    Set<String> getRunningTaskIds() {
        return runningTasks.values().stream().map(entry -> entry.runningTask.getId()).collect(Collectors.toSet());
    }

    /** @return specs of the tasks currently tracked in {@link #runningTasks} */
    List<SubTaskSpec<T>> getRunningSubTaskSpecs() {
        return runningTasks.values().stream().map(monitorEntry -> monitorEntry.spec).collect(Collectors.toList());
    }

    /**
     * Returns the entry of the running spec of the given id.
     *
     * @param subTaskSpecId id of the spec to look up
     * @return the matching entry, or null if no running spec has this id
     */
    @Nullable
    MonitorEntry getRunningTaskMonitorEntry(String subTaskSpecId) {
        // runningTasks is keyed by the spec id (see submit() and retry()), so a direct lookup is enough.
        // ConcurrentHashMap rejects null keys, so a null id is answered without touching the map.
        return subTaskSpecId == null ? null : runningTasks.get(subTaskSpecId);
    }

    /** @return specs which reached their final state, whether succeeded or failed */
    List<SubTaskSpec<T>> getCompleteSubTaskSpecs() {
        return taskHistories.values().stream().map(TaskHistory::getSpec).collect(Collectors.toList());
    }

    /**
     * @param subTaskSpecId id of the spec to look up
     * @return the history of the complete spec of the given id, or null if there is no such spec
     */
    @Nullable
    TaskHistory<T> getCompleteSubTaskSpecHistory(String subTaskSpecId) {
        return taskHistories.get(subTaskSpecId);
    }

    /**
     * State of a single running {@link SubTaskSpec}: the task currently running for it, the statuses of its
     * previous tasks, and the future to be set when the spec reaches its final state.
     */
    class MonitorEntry {
        private final SubTaskSpec<T> spec;
        private final T runningTask;
        // old tasks to recent tasks. running task is not included
        private final CopyOnWriteArrayList<TaskStatusPlus> taskHistory;
        private final SettableFuture<SubTaskCompleteEvent<T>> completeEventFuture;

        /**
         * This variable is updated inside of the periodic status check executed by
         * {@link #taskStatusChecker}, and can be read by calling {@link #getRunningStatus}.
         */
        @Nullable
        private volatile TaskStatusPlus runningStatus;

        /** Creates an entry for the first task of a spec, starting with an empty task history. */
        MonitorEntry(SubTaskSpec<T> spec, T runningTask, @Nullable TaskStatusPlus runningStatus,
                SettableFuture<SubTaskCompleteEvent<T>> completeEventFuture) {
            this(spec, runningTask, runningStatus, new CopyOnWriteArrayList<>(), completeEventFuture);
        }

        private MonitorEntry(SubTaskSpec<T> spec, T runningTask, @Nullable TaskStatusPlus runningStatus,
                CopyOnWriteArrayList<TaskStatusPlus> taskHistory,
                SettableFuture<SubTaskCompleteEvent<T>> completeEventFuture) {
            this.spec = spec;
            this.runningTask = runningTask;
            this.runningStatus = runningStatus;
            this.taskHistory = taskHistory;
            this.completeEventFuture = completeEventFuture;
        }

        /**
         * Appends the status of the last task to the history and returns a new entry tracking the given new task.
         * The returned entry shares the history list and the complete event future with this entry.
         */
        MonitorEntry withNewRunningTask(T newTask, @Nullable TaskStatusPlus newStatus,
                TaskStatusPlus statusOfLastTask) {
            taskHistory.add(statusOfLastTask);
            return new MonitorEntry(spec, newTask, newStatus, taskHistory, completeEventFuture);
        }

        /** @return the number of tasks created for the spec so far, including the running one */
        int numTries() {
            return taskHistory.size() + 1; // count runningTask as well
        }

        /**
         * Replaces the status of the running task.
         *
         * @throws ISE if the given status does not belong to the running task
         */
        void updateStatus(TaskStatusPlus statusPlus) {
            if (!runningTask.getId().equals(statusPlus.getId())) {
                throw new ISE("Task id[%s] of lastStatus is different from the running task[%s]",
                        statusPlus.getId(), runningTask.getId());
            }
            this.runningStatus = statusPlus;
        }

        /**
         * Records the final status of the spec: appends it to the history, publishes the history to
         * {@link #taskHistories}, and completes the future returned by {@link #submit}. The complete event is
         * created with {@link SubTaskCompleteEvent#success} for both succeeded and failed final statuses; the
         * actual state is available via {@link SubTaskCompleteEvent#getLastState()}.
         *
         * @throws ISE if the given status does not belong to the running task
         */
        void setLastStatus(TaskStatusPlus lastStatus) {
            if (!runningTask.getId().equals(lastStatus.getId())) {
                throw new ISE("Task id[%s] of lastStatus is different from the running task[%s]",
                        lastStatus.getId(), runningTask.getId());
            }

            this.runningStatus = lastStatus;
            taskHistory.add(lastStatus);
            taskHistories.put(spec.getId(), new TaskHistory<>(spec, taskHistory));
            completeEventFuture.set(SubTaskCompleteEvent.success(spec, lastStatus));
        }

        SubTaskSpec<T> getSpec() {
            return spec;
        }

        @Nullable
        TaskStatusPlus getRunningStatus() {
            return runningStatus;
        }

        /** @return the internal history list itself (not a copy) */
        List<TaskStatusPlus> getTaskHistory() {
            return taskHistory;
        }
    }

    /**
     * Result delivered through the future returned by {@link #submit}. It carries either the last status of the
     * spec or the throwable explaining why no status is available.
     */
    static class SubTaskCompleteEvent<T extends Task> {
        private final SubTaskSpec<T> spec;
        @Nullable
        private final TaskStatusPlus lastStatus;
        @Nullable
        private final Throwable throwable;

        /** Creates an event carrying the given non-null last status and no throwable. */
        static <T extends Task> SubTaskCompleteEvent<T> success(SubTaskSpec<T> spec, TaskStatusPlus lastStatus) {
            return new SubTaskCompleteEvent<>(spec, Preconditions.checkNotNull(lastStatus, "lastStatus"), null);
        }

        /** Creates an event carrying the given throwable and no last status. */
        static <T extends Task> SubTaskCompleteEvent<T> fail(SubTaskSpec<T> spec, Throwable t) {
            return new SubTaskCompleteEvent<>(spec, null, t);
        }

        private SubTaskCompleteEvent(SubTaskSpec<T> spec, @Nullable TaskStatusPlus lastStatus,
                @Nullable Throwable throwable) {
            this.spec = Preconditions.checkNotNull(spec, "spec");
            this.lastStatus = lastStatus;
            this.throwable = throwable;
        }

        SubTaskSpec<T> getSpec() {
            return spec;
        }

        /** @return the state of the last status, or {@link TaskState#FAILED} if there is no last status */
        TaskState getLastState() {
            return lastStatus == null ? TaskState.FAILED : lastStatus.getStatusCode();
        }

        @Nullable
        TaskStatusPlus getLastStatus() {
            return lastStatus;
        }

        @Nullable
        Throwable getThrowable() {
            return throwable;
        }
    }
}