org.apache.flink.runtime.taskmanager.TaskManager.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flink.runtime.taskmanager.TaskManager.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.taskmanager;

import java.io.File;
import java.io.IOException;
import java.lang.management.GarbageCollectorMXBean;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.management.MemoryUsage;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketAddress;
import java.net.UnknownHostException;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.flink.api.common.cache.DistributedCache;
import org.apache.flink.api.common.cache.DistributedCache.DistributedCacheEntry;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.protocols.VersionedProtocol;
import org.apache.flink.runtime.ExecutionMode;
import org.apache.flink.runtime.blob.BlobCache;
import org.apache.flink.runtime.broadcast.BroadcastVariableManager;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.execution.CancelTaskException;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.execution.RuntimeEnvironment;
import org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager;
import org.apache.flink.runtime.execution.librarycache.FallbackLibraryCacheManager;
import org.apache.flink.runtime.execution.librarycache.LibraryCacheManager;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.filecache.FileCache;
import org.apache.flink.runtime.instance.Hardware;
import org.apache.flink.runtime.instance.HardwareDescription;
import org.apache.flink.runtime.instance.InstanceConnectionInfo;
import org.apache.flink.runtime.instance.InstanceID;
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync;
import org.apache.flink.runtime.io.network.ChannelManager;
import org.apache.flink.runtime.io.network.LocalConnectionManager;
import org.apache.flink.runtime.io.network.NetworkConnectionManager;
import org.apache.flink.runtime.io.network.netty.NettyConnectionManager;
import org.apache.flink.runtime.ipc.RPC;
import org.apache.flink.runtime.ipc.Server;
import org.apache.flink.runtime.jobgraph.JobID;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider;
import org.apache.flink.runtime.memorymanager.DefaultMemoryManager;
import org.apache.flink.runtime.memorymanager.MemoryManager;
import org.apache.flink.runtime.net.NetUtils;
import org.apache.flink.runtime.profiling.ProfilingUtils;
import org.apache.flink.runtime.profiling.TaskManagerProfiler;
import org.apache.flink.runtime.protocols.AccumulatorProtocol;
import org.apache.flink.runtime.protocols.ChannelLookupProtocol;
import org.apache.flink.runtime.protocols.InputSplitProviderProtocol;
import org.apache.flink.runtime.protocols.JobManagerProtocol;
import org.apache.flink.runtime.protocols.TaskOperationProtocol;
import org.apache.flink.runtime.util.EnvironmentInformation;
import org.apache.flink.runtime.util.ExecutorThreadFactory;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * A task manager receives tasks from the job manager and executes them. After having executed them
 * (or in case of an execution error) it reports the execution result back to the job manager.
 * Task managers are able to automatically discover the job manager and receive its configuration from it
 * as long as the job manager is running on the same local network
 */
public class TaskManager implements TaskOperationProtocol {

    private static final Logger LOG = LoggerFactory.getLogger(TaskManager.class);

    private static final int STARTUP_FAILURE_RETURN_CODE = 1;

    private static final int MAX_LOST_HEART_BEATS = 3;

    private static final int DELAY_AFTER_LOST_CONNECTION = 10000;

    public final static String ARG_CONF_DIR = "tempDir";

    // --------------------------------------------------------------------------------------------

    private final ExecutorService executorService = Executors.newFixedThreadPool(2 * Hardware.getNumberCPUCores(),
            ExecutorThreadFactory.INSTANCE);

    private final InstanceConnectionInfo localInstanceConnectionInfo;

    private final HardwareDescription hardwareDescription;

    private final ExecutionMode executionMode;

    private final JobManagerProtocol jobManager;

    private final InputSplitProviderProtocol globalInputSplitProvider;

    private final ChannelLookupProtocol lookupService;

    private final AccumulatorProtocol accumulatorProtocolProxy;

    private final LibraryCacheManager libraryCacheManager;

    private final BroadcastVariableManager bcVarManager = new BroadcastVariableManager();

    private final Server taskManagerServer;

    private final FileCache fileCache = new FileCache();

    /** All currently running tasks */
    private final ConcurrentHashMap<ExecutionAttemptID, Task> runningTasks = new ConcurrentHashMap<ExecutionAttemptID, Task>();

    /** The {@link ChannelManager} sets up and cleans up the data exchange channels of the tasks. */
    private final ChannelManager channelManager;

    /** Instance of the task manager profile if profiling is enabled. */
    private final TaskManagerProfiler profiler;

    private final MemoryManager memoryManager;

    private final IOManager ioManager;

    private final int numberOfSlots;

    private final Thread heartbeatThread;

    private final AtomicBoolean shutdownStarted = new AtomicBoolean(false);

    private volatile InstanceID registeredId;

    /** Stores whether the task manager has already been shut down. */
    private volatile boolean shutdownComplete;

    // --------------------------------------------------------------------------------------------
    //  Constructor & Shutdown
    // --------------------------------------------------------------------------------------------

    public TaskManager(ExecutionMode executionMode, JobManagerProtocol jobManager,
            InputSplitProviderProtocol splitProvider, ChannelLookupProtocol channelLookup,
            AccumulatorProtocol accumulators, InetSocketAddress jobManagerAddress,
            InetAddress taskManagerBindAddress) throws Exception {
        if (executionMode == null || jobManager == null || splitProvider == null || channelLookup == null
                || accumulators == null) {
            throw new NullPointerException();
        }

        LOG.info("TaskManager execution mode: " + executionMode);

        this.executionMode = executionMode;
        this.jobManager = jobManager;
        this.lookupService = channelLookup;
        this.globalInputSplitProvider = splitProvider;
        this.accumulatorProtocolProxy = accumulators;

        // initialize the number of slots
        {
            int slots = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, -1);
            if (slots == -1) {
                slots = 1;
                LOG.info("Number of task slots not configured. Creating one task slot.");
            } else if (slots <= 0) {
                throw new Exception("Illegal value for the number of task slots: " + slots);
            } else {
                LOG.info("Creating " + slots + " task slot(s).");
            }
            this.numberOfSlots = slots;
        }

        int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1);
        int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1);
        if (ipcPort == -1) {
            ipcPort = getAvailablePort();
        }
        if (dataPort == -1) {
            dataPort = getAvailablePort();
        }

        this.localInstanceConnectionInfo = new InstanceConnectionInfo(taskManagerBindAddress, ipcPort, dataPort);
        LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo);

        // Start local RPC server, give it the number of threads as we have slots
        try {
            // some magic number for the handler threads
            final int numHandlers = Math.min(numberOfSlots, 2 * Hardware.getNumberCPUCores());

            this.taskManagerServer = RPC.getServer(this, taskManagerBindAddress.getHostAddress(), ipcPort,
                    numHandlers);
            this.taskManagerServer.start();
        } catch (IOException e) {
            LOG.error("Failed to start TaskManager server. " + e.getMessage(), e);
            throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e);
        }

        // Load profiler if it should be used
        if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) {

            final String profilerClassName = GlobalConfiguration.getString(ProfilingUtils.TASKMANAGER_CLASSNAME_KEY,
                    "org.apache.flink.runtime.profiling.impl.TaskManagerProfilerImpl");

            this.profiler = ProfilingUtils.loadTaskManagerProfiler(profilerClassName,
                    jobManagerAddress.getAddress(), this.localInstanceConnectionInfo);

            if (this.profiler == null) {
                LOG.error("Cannot find class name for the profiler.");
            } else {
                LOG.info("Profiling of jobs is enabled.");
            }
        } else {
            this.profiler = null;
            LOG.info("Profiling of jobs is disabled.");
        }

        // Get the directory for storing temporary files
        final String[] tmpDirPaths = GlobalConfiguration
                .getString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH)
                .split(",|" + File.pathSeparator);

        checkTempDirs(tmpDirPaths);

        int numBuffers = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS);

        int bufferSize = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

        // Initialize the channel manager
        try {
            NetworkConnectionManager networkConnectionManager = null;

            switch (executionMode) {
            case LOCAL:
                networkConnectionManager = new LocalConnectionManager();
                break;
            case CLUSTER:
                int numInThreads = GlobalConfiguration.getInteger(
                        ConfigConstants.TASK_MANAGER_NET_NUM_IN_THREADS_KEY,
                        ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_IN_THREADS);

                int numOutThreads = GlobalConfiguration.getInteger(
                        ConfigConstants.TASK_MANAGER_NET_NUM_OUT_THREADS_KEY,
                        ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_OUT_THREADS);

                int lowWaterMark = GlobalConfiguration.getInteger(
                        ConfigConstants.TASK_MANAGER_NET_NETTY_LOW_WATER_MARK,
                        ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_LOW_WATER_MARK);

                int highWaterMark = GlobalConfiguration.getInteger(
                        ConfigConstants.TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK,
                        ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK);

                networkConnectionManager = new NettyConnectionManager(localInstanceConnectionInfo.address(),
                        localInstanceConnectionInfo.dataPort(), bufferSize, numInThreads, numOutThreads,
                        lowWaterMark, highWaterMark);
                break;
            }

            channelManager = new ChannelManager(lookupService, localInstanceConnectionInfo, numBuffers, bufferSize,
                    networkConnectionManager);
        } catch (IOException ioe) {
            LOG.error(StringUtils.stringifyException(ioe));
            throw new Exception("Failed to instantiate ChannelManager.", ioe);
        }

        // initialize the memory manager
        {
            // Check whether the memory size has been explicitly configured.
            final long configuredMemorySize = GlobalConfiguration
                    .getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1);
            final long memorySize;

            if (configuredMemorySize == -1) {
                // no manually configured memory. take a relative fraction of the free heap space
                float fraction = GlobalConfiguration.getFloat(ConfigConstants.TASK_MANAGER_MEMORY_FRACTION_KEY,
                        ConfigConstants.DEFAULT_MEMORY_MANAGER_MEMORY_FRACTION);
                memorySize = (long) (EnvironmentInformation.getSizeOfFreeHeapMemoryWithDefrag() * fraction);
                LOG.info("Using " + fraction + " of the free heap space for managed memory.");
            } else if (configuredMemorySize <= 0) {
                throw new Exception("Invalid value for Memory Manager memory size: " + configuredMemorySize);
            } else {
                memorySize = configuredMemorySize << 20;
            }

            final int pageSize = GlobalConfiguration.getInteger(
                    ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
                    ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

            // Initialize the memory manager
            LOG.info("Initializing memory manager with " + (memorySize >>> 20) + " megabytes of memory. "
                    + "Page size is " + pageSize + " bytes.");

            try {
                @SuppressWarnings("unused")
                final boolean lazyAllocation = GlobalConfiguration.getBoolean(
                        ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY,
                        ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION);

                this.memoryManager = new DefaultMemoryManager(memorySize, this.numberOfSlots, pageSize);
            } catch (Throwable t) {
                LOG.error(
                        "Unable to initialize memory manager with " + (memorySize >>> 20) + " megabytes of memory.",
                        t);
                throw new Exception("Unable to initialize memory manager.", t);
            }
        }

        this.hardwareDescription = HardwareDescription.extractFromSystem(this.memoryManager.getMemorySize());

        // Determine the port of the BLOB server and register it with the library cache manager
        {
            final int blobPort = this.jobManager.getBlobServerPort();

            if (blobPort == -1) {
                LOG.warn("Unable to determine BLOB server address: User library download will not be available");
                this.libraryCacheManager = new FallbackLibraryCacheManager();
            } else {
                final InetSocketAddress blobServerAddress = new InetSocketAddress(jobManagerAddress.getAddress(),
                        blobPort);
                LOG.info("Determined BLOB server address to be " + blobServerAddress);

                this.libraryCacheManager = new BlobLibraryCacheManager(new BlobCache(blobServerAddress),
                        GlobalConfiguration.getConfiguration());
            }
        }
        this.ioManager = new IOManagerAsync(tmpDirPaths);

        // start the heart beats
        {
            final long interval = GlobalConfiguration.getInteger(
                    ConfigConstants.TASK_MANAGER_HEARTBEAT_INTERVAL_KEY,
                    ConfigConstants.DEFAULT_TASK_MANAGER_HEARTBEAT_INTERVAL);

            this.heartbeatThread = new Thread() {
                @Override
                public void run() {
                    registerAndRunHeartbeatLoop(interval, MAX_LOST_HEART_BEATS);
                }
            };
            this.heartbeatThread.setName("Heartbeat Thread");
            this.heartbeatThread.start();
        }

        // --------------------------------------------------------------------
        // Memory Usage
        // --------------------------------------------------------------------

        final MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
        final List<GarbageCollectorMXBean> gcMXBeans = ManagementFactory.getGarbageCollectorMXBeans();

        LOG.info(getMemoryUsageStatsAsString(memoryMXBean));

        boolean startMemoryUsageLogThread = GlobalConfiguration.getBoolean(
                ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD,
                ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD);

        if (startMemoryUsageLogThread) {
            final int logIntervalMs = GlobalConfiguration.getInteger(
                    ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS,
                    ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS);

            new Thread(new Runnable() {
                @Override
                public void run() {
                    try {
                        while (!isShutDown()) {
                            Thread.sleep(logIntervalMs);

                            LOG.info(getMemoryUsageStatsAsString(memoryMXBean));
                            LOG.info(getGarbageCollectorStatsAsString(gcMXBeans));
                        }
                    } catch (InterruptedException e) {
                        LOG.warn("Unexpected interruption of memory usage logger thread.");
                    }
                }
            }).start();
        }
    }

    /**
     * Shuts the task manager down.
     */
    public void shutdown() {
        if (!this.shutdownStarted.compareAndSet(false, true)) {
            return;
        }

        LOG.info("Shutting down TaskManager");

        cancelAndClearEverything(new Exception("Task Manager is shutting down"));

        // first, stop the heartbeat thread and wait for it to terminate
        this.heartbeatThread.interrupt();
        try {
            this.heartbeatThread.join(1000);
        } catch (InterruptedException e) {
        }

        this.registeredId = null;

        // Stop RPC proxy for the task manager
        stopProxy(this.jobManager);

        // Stop RPC proxy for the global input split assigner
        stopProxy(this.globalInputSplitProvider);

        // Stop RPC proxy for the lookup service
        stopProxy(this.lookupService);

        // Stop RPC proxy for accumulator reports
        stopProxy(this.accumulatorProtocolProxy);

        // Shut down the own RPC server
        try {
            this.taskManagerServer.stop();
        } catch (Throwable t) {
            LOG.warn("TaskManager RPC server did not shut down properly.", t);
        }

        // Stop profiling if enabled
        if (this.profiler != null) {
            this.profiler.shutdown();
        }

        // Shut down the channel manager
        try {
            this.channelManager.shutdown();
        } catch (Throwable t) {
            LOG.warn("ChannelManager did not shutdown properly: " + t.getMessage(), t);
        }

        // Shut down the memory manager
        if (this.ioManager != null) {
            this.ioManager.shutdown();
        }

        if (this.memoryManager != null) {
            this.memoryManager.shutdown();
        }

        if (libraryCacheManager != null) {
            try {
                this.libraryCacheManager.shutdown();
            } catch (IOException e) {
                LOG.warn("Could not properly shutdown the library cache manager.", e);
            }
        }

        this.fileCache.shutdown();

        // Shut down the executor service
        if (this.executorService != null) {
            this.executorService.shutdown();
            try {
                this.executorService.awaitTermination(5000L, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                LOG.debug("Shutdown of executor thread pool interrupted", e);
            }
        }

        this.shutdownComplete = true;
    }

    /**
     * Checks whether the task manager has already been shut down.
     * 
     * @return <code>true</code> if the task manager has already been shut down, <code>false</code> otherwise
     */
    public boolean isShutDown() {
        return this.shutdownComplete;
    }

    // --------------------------------------------------------------------------------------------
    //  Properties
    // --------------------------------------------------------------------------------------------

    public InstanceConnectionInfo getConnectionInfo() {
        return this.localInstanceConnectionInfo;
    }

    public ExecutionMode getExecutionMode() {
        return this.executionMode;
    }

    /**
     * Gets the ID under which the TaskManager is currently registered at its JobManager.
     * If the TaskManager has not been registered, yet, or if it lost contact, this is is null.
     * 
     * @return The ID under which the TaskManager is currently registered.
     */
    public InstanceID getRegisteredId() {
        return this.registeredId;
    }

    /**
     * Checks if the TaskManager is properly registered and ready to receive work.
     * 
     * @return True, if the TaskManager is registered, false otherwise.
     */
    public boolean isRegistered() {
        return this.registeredId != null;
    }

    public Map<ExecutionAttemptID, Task> getAllRunningTasks() {
        return Collections.unmodifiableMap(this.runningTasks);
    }

    public ChannelManager getChannelManager() {
        return channelManager;
    }

    public BroadcastVariableManager getBroadcastVariableManager() {
        return this.bcVarManager;
    }

    // --------------------------------------------------------------------------------------------
    //  Task Operation
    // --------------------------------------------------------------------------------------------

    @Override
    public TaskOperationResult cancelTask(ExecutionAttemptID executionId) throws IOException {

        final Task task = this.runningTasks.get(executionId);

        if (task == null) {
            return new TaskOperationResult(executionId, false, "No task with that execution ID was found.");
        }

        // Pass call to executor service so IPC thread can return immediately
        final Runnable r = new Runnable() {
            @Override
            public void run() {
                task.cancelExecution();
            }
        };
        this.executorService.execute(r);

        // return success
        return new TaskOperationResult(executionId, true);
    }

    @Override
    public TaskOperationResult submitTask(TaskDeploymentDescriptor tdd) {
        final JobID jobID = tdd.getJobID();
        final JobVertexID vertexId = tdd.getVertexID();
        final ExecutionAttemptID executionId = tdd.getExecutionId();
        final int taskIndex = tdd.getIndexInSubtaskGroup();
        final int numSubtasks = tdd.getCurrentNumberOfSubtasks();

        Task task = null;

        // check if the taskmanager is shut down or disconnected
        if (shutdownStarted.get()) {
            return new TaskOperationResult(executionId, false, "TaskManager is shut down.");
        }
        if (registeredId == null) {
            return new TaskOperationResult(executionId, false, "TaskManager lost connection to JobManager.");
        }

        try {
            // Now register data with the library manager
            libraryCacheManager.registerTask(jobID, executionId, tdd.getRequiredJarFiles());

            // library and classloader issues first
            final ClassLoader userCodeClassLoader = libraryCacheManager.getClassLoader(jobID);
            if (userCodeClassLoader == null) {
                throw new Exception("No user code ClassLoader available.");
            }

            task = new Task(jobID, vertexId, taskIndex, numSubtasks, executionId, tdd.getTaskName(), this);
            if (this.runningTasks.putIfAbsent(executionId, task) != null) {
                throw new Exception("TaskManager contains already a task with executionId " + executionId);
            }

            final InputSplitProvider splitProvider = new TaskInputSplitProvider(this.globalInputSplitProvider,
                    jobID, vertexId, executionId);
            final RuntimeEnvironment env = new RuntimeEnvironment(task, tdd, userCodeClassLoader,
                    this.memoryManager, this.ioManager, splitProvider, this.accumulatorProtocolProxy,
                    this.bcVarManager);
            task.setEnvironment(env);

            // register the task with the network stack and profilers
            this.channelManager.register(task);

            final Configuration jobConfig = tdd.getJobConfiguration();

            boolean enableProfiling = this.profiler != null
                    && jobConfig.getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true);

            // Register environment, input, and output gates for profiling
            if (enableProfiling) {
                task.registerProfiler(this.profiler, jobConfig);
            }

            // now that the task is successfully created and registered, we can start copying the
            // distributed cache temp files
            Map<String, FutureTask<Path>> cpTasks = new HashMap<String, FutureTask<Path>>();
            for (Entry<String, DistributedCacheEntry> e : DistributedCache
                    .readFileInfoFromConfig(tdd.getJobConfiguration())) {
                FutureTask<Path> cp = this.fileCache.createTmpFile(e.getKey(), e.getValue(), jobID);
                cpTasks.put(e.getKey(), cp);
            }
            env.addCopyTasksForCacheFile(cpTasks);

            if (!task.startExecution()) {
                throw new CancelTaskException();
            }

            // final check that we can go (we do this after the registration, so the the "happen's before"
            // relationship ensures that either the shutdown removes this task, or we are aware of the shutdown
            if (shutdownStarted.get() || this.registeredId == null) {
                throw new Exception("Task Manager is shut down or is not connected to a JobManager.");
            }

            return new TaskOperationResult(executionId, true);
        } catch (Throwable t) {
            String message;
            if (t instanceof CancelTaskException) {
                message = "Task was canceled";
            } else {
                LOG.error("Could not instantiate task", t);
                message = ExceptionUtils.stringifyException(t);
            }

            try {
                try {
                    task.failExternally(t);
                } catch (Throwable t2) {
                    LOG.error("Error during cleanup of task deployment", t2);
                }

                this.runningTasks.remove(executionId);

                if (task != null) {
                    removeAllTaskResources(task);
                }

                libraryCacheManager.unregisterTask(jobID, executionId);
            } catch (Throwable t2) {
                LOG.error("Error during cleanup of task deployment", t2);
            }

            return new TaskOperationResult(executionId, false, message);
        }
    }

    /**
     * Unregisters a finished or aborted task.
     * 
     * @param executionId
     *        the ID of the task to be unregistered
     */
    private void unregisterTask(ExecutionAttemptID executionId) {
        // Task de-registration must be atomic
        final Task task = this.runningTasks.remove(executionId);
        if (task == null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Cannot find task with ID " + executionId + " to unregister");
            }
            return;
        }

        removeAllTaskResources(task);

        // Unregister task from library cache manager
        libraryCacheManager.unregisterTask(task.getJobID(), executionId);
    }

    private void removeAllTaskResources(Task task) {
        // Unregister task from the byte buffered channel manager
        this.channelManager.unregister(task.getExecutionId(), task);

        // Unregister task from profiling
        task.unregisterProfiler(this.profiler);

        // Unregister task from memory manager
        task.unregisterMemoryManager(this.memoryManager);

        // remove the local tmp file for unregistered tasks.
        try {
            RuntimeEnvironment re = task.getEnvironment();
            if (re != null) {
                for (Entry<String, DistributedCacheEntry> e : DistributedCache
                        .readFileInfoFromConfig(task.getEnvironment().getJobConfiguration())) {
                    this.fileCache.deleteTmpFile(e.getKey(), e.getValue(), task.getJobID());
                }
            }
        } catch (Throwable t) {
            LOG.error("Error cleaning up local files from the distributed cache.", t);
        }
    }

    public void notifyExecutionStateChange(JobID jobID, ExecutionAttemptID executionId,
            ExecutionState newExecutionState, Throwable optionalError) {

        // Get lock on the jobManager object and propagate the state change
        boolean success = false;
        try {
            success = this.jobManager.updateTaskExecutionState(
                    new TaskExecutionState(jobID, executionId, newExecutionState, optionalError));
        } catch (Throwable t) {
            String msg = "Error sending task state update to JobManager.";
            LOG.error(msg, t);
            ExceptionUtils.rethrow(t, msg);
        } finally {
            // in case of a failure, or when the tasks is in a finished state, then unregister the
            // task (free all buffers, remove all channels, task-specific class loaders, etc...)
            if (!success || newExecutionState == ExecutionState.FINISHED
                    || newExecutionState == ExecutionState.CANCELED || newExecutionState == ExecutionState.FAILED) {
                unregisterTask(executionId);
            }
        }
    }

    /**
     * Removes all tasks from this TaskManager.
     */
    public void cancelAndClearEverything(Throwable cause) {
        if (runningTasks.size() > 0) {
            LOG.info("Cancelling all computations and discarding all cached data.");

            for (Task t : runningTasks.values()) {
                t.failExternally(cause);
                runningTasks.remove(t.getExecutionId());
            }
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Heartbeats
    // --------------------------------------------------------------------------------------------

    /**
     * This method registers the TaskManager at the jobManager and send periodic heartbeats.
     */
    private void registerAndRunHeartbeatLoop(long interval, int maxNonSuccessfulHeatbeats) {

        while (!shutdownStarted.get()) {
            InstanceID resultId = null;

            // try to register. We try as long as we need to, because it may be that the jobmanager is  not yet online
            {
                final long maxDelay = 10000; // the maximal delay between registration attempts
                final long reportingDelay = 5000;
                long currentDelay = 100; // initially, wait 100 msecs for the next registration attempt

                while (!shutdownStarted.get()) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Trying to register at Jobmanager...");
                    }

                    try {
                        resultId = this.jobManager.registerTaskManager(this.localInstanceConnectionInfo,
                                this.hardwareDescription, this.numberOfSlots);

                        if (resultId == null) {
                            throw new Exception("Registration attempt refused by JobManager.");
                        }
                    } catch (Exception e) {
                        // this may be if the job manager was not yet online
                        // if this has happened for a while, report it. if it has just happened
                        // at the very beginning, this may not mean anything (JM still in startup)
                        if (currentDelay >= reportingDelay) {
                            LOG.error("Connection to JobManager failed.", e);
                        } else if (LOG.isDebugEnabled()) {
                            LOG.debug("Could not connect to JobManager.", e);
                        }
                    }

                    // check if we were accepted
                    if (resultId != null) {
                        // success
                        this.registeredId = resultId;
                        break;
                    }

                    try {
                        Thread.sleep(currentDelay);
                    } catch (InterruptedException e) {
                        // may be due to shutdown
                        if (!shutdownStarted.get()) {
                            LOG.error("TaskManager's registration loop was interrupted without shutdown.");
                        }
                    }

                    // increase the time between registration attempts, to not keep on pinging overly frequently
                    currentDelay = Math.min(2 * currentDelay, maxDelay);
                }
            }

            // registration complete, or shutdown
            int successiveUnsuccessfulHeartbeats = 0;

            // the heart beat loop
            while (!shutdownStarted.get()) {
                // sleep until the next heart beat
                try {
                    Thread.sleep(interval);
                } catch (InterruptedException e) {
                    if (!shutdownStarted.get()) {
                        LOG.error("TaskManager heart beat loop was interrupted without shutdown.");
                    }
                }

                // send heart beat
                try {
                    boolean accepted = this.jobManager.sendHeartbeat(resultId);

                    if (accepted) {
                        // reset the unsuccessful heart beats
                        successiveUnsuccessfulHeartbeats = 0;
                    } else {
                        successiveUnsuccessfulHeartbeats++;
                        LOG.error("JobManager rejected heart beat.");
                    }
                } catch (IOException e) {
                    if (!shutdownStarted.get()) {
                        successiveUnsuccessfulHeartbeats++;
                        LOG.error("Sending the heart beat failed on I/O error: " + e.getMessage(), e);
                    }
                }

                if (successiveUnsuccessfulHeartbeats == maxNonSuccessfulHeatbeats) {
                    // we are done for, we cannot connect to the jobmanager any more
                    // or we are not welcome there any more
                    // what to do now? Wait for a while and try to reconnect
                    LOG.error("TaskManager has lost connection to JobManager.");

                    // mark us as disconnected and abort all computation
                    this.registeredId = null;
                    cancelAndClearEverything(new Exception("TaskManager lost heartbeat connection to JobManager"));

                    // wait for a while, then attempt to register again
                    try {
                        Thread.sleep(DELAY_AFTER_LOST_CONNECTION);
                    } catch (InterruptedException e) {
                        if (!shutdownStarted.get()) {
                            LOG.error("TaskManager heart beat loop was interrupted without shutdown.");
                        }
                    }

                    // leave the heart beat loop
                    break;
                }
            } // end heart beat loop
        } // end while not shutdown
    }

    // --------------------------------------------------------------------------------------------
    //  Memory and Garbage Collection Debugging Utilities
    // --------------------------------------------------------------------------------------------

    private String getMemoryUsageStatsAsString(MemoryMXBean memoryMXBean) {
        MemoryUsage heap = memoryMXBean.getHeapMemoryUsage();
        MemoryUsage nonHeap = memoryMXBean.getNonHeapMemoryUsage();

        int mb = 20;

        long heapUsed = heap.getUsed() >> mb;
        long heapCommitted = heap.getCommitted() >> mb;
        long heapMax = heap.getMax() >> mb;

        long nonHeapUsed = nonHeap.getUsed() >> mb;
        long nonHeapCommitted = nonHeap.getCommitted() >> mb;
        long nonHeapMax = nonHeap.getMax() >> mb;

        String msg = String.format(
                "Memory usage stats: [HEAP: %d/%d/%d MB, NON HEAP: %d/%d/%d MB (used/comitted/max)]", heapUsed,
                heapCommitted, heapMax, nonHeapUsed, nonHeapCommitted, nonHeapMax);

        return msg;
    }

    private String getGarbageCollectorStatsAsString(List<GarbageCollectorMXBean> gcMXBeans) {
        StringBuilder str = new StringBuilder();
        str.append("Garbage collector stats: ");

        for (int i = 0; i < gcMXBeans.size(); i++) {
            GarbageCollectorMXBean bean = gcMXBeans.get(i);

            String msg = String.format("[%s, GC TIME (ms): %d, GC COUNT: %d]", bean.getName(),
                    bean.getCollectionTime(), bean.getCollectionCount());
            str.append(msg);
            str.append(i < gcMXBeans.size() - 1 ? ", " : "");
        }

        return str.toString();
    }

    // --------------------------------------------------------------------------------------------
    //  Execution & Initialization
    // --------------------------------------------------------------------------------------------

    public static TaskManager createTaskManager(ExecutionMode mode) throws Exception {

        // IMPORTANT! At this point, the GlobalConfiguration must have been read!

        final InetSocketAddress jobManagerAddress;
        LOG.info("Reading location of job manager from configuration");

        final String address = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null);
        final int port = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
                ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);

        if (address == null) {
            throw new Exception("Job manager address not configured in the GlobalConfiguration.");
        }

        // Try to convert configured address to {@link InetAddress}
        try {
            final InetAddress tmpAddress = InetAddress.getByName(address);
            jobManagerAddress = new InetSocketAddress(tmpAddress, port);
        } catch (UnknownHostException e) {
            LOG.error("Could not resolve JobManager host name.");
            throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e);
        }

        return createTaskManager(mode, jobManagerAddress);
    }

    public static TaskManager createTaskManager(ExecutionMode mode, InetSocketAddress jobManagerAddress)
            throws Exception {
        // Determine our own public facing address and start the server
        final InetAddress taskManagerAddress;
        try {
            taskManagerAddress = getTaskManagerAddress(jobManagerAddress);
        } catch (IOException e) {
            throw new Exception(
                    "The TaskManager failed to determine the IP address of the interface that connects to the JobManager.",
                    e);
        }

        return createTaskManager(mode, jobManagerAddress, taskManagerAddress);
    }

    public static TaskManager createTaskManager(ExecutionMode mode, InetSocketAddress jobManagerAddress,
            InetAddress taskManagerAddress) throws Exception {

        // IMPORTANT! At this point, the GlobalConfiguration must have been read!

        LOG.info("Connecting to JobManager at: " + jobManagerAddress);

        // Create RPC connections to the JobManager

        JobManagerProtocol jobManager = null;
        InputSplitProviderProtocol splitProvider = null;
        ChannelLookupProtocol channelLookup = null;
        AccumulatorProtocol accumulators = null;

        // try/finally block to close proxies if anything goes wrong
        boolean success = false;
        try {
            // create the RPC call proxy to the job manager for jobs
            try {
                jobManager = RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
            } catch (IOException e) {
                LOG.error("Could not connect to the JobManager: " + e.getMessage(), e);
                throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e);
            }

            // Try to create local stub of the global input split provider
            try {
                splitProvider = RPC.getProxy(InputSplitProviderProtocol.class, jobManagerAddress,
                        NetUtils.getSocketFactory());
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                throw new Exception(
                        "Failed to initialize connection to global input split provider: " + e.getMessage(), e);
            }

            // Try to create local stub for the lookup service
            try {
                channelLookup = RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress,
                        NetUtils.getSocketFactory());
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e);
            }

            // Try to create local stub for the accumulators
            try {
                accumulators = RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress,
                        NetUtils.getSocketFactory());
            } catch (IOException e) {
                LOG.error("Failed to initialize accumulator protocol: " + e.getMessage(), e);
                throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e);
            }

            TaskManager tm = new TaskManager(mode, jobManager, splitProvider, channelLookup, accumulators,
                    jobManagerAddress, taskManagerAddress);
            success = true;
            return tm;
        } finally {
            if (!success) {
                stopProxy(jobManager);
                stopProxy(splitProvider);
                stopProxy(channelLookup);
                stopProxy(accumulators);
            }
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Executable
    // --------------------------------------------------------------------------------------------

    /**
     * Entry point for the TaskManager executable.
     * 
     * @param args Arguments from the command line
     * @throws IOException 
     */
    @SuppressWarnings("static-access")
    public static void main(String[] args) throws IOException {
        Option configDirOpt = OptionBuilder.withArgName("config directory").hasArg()
                .withDescription("Specify configuration directory.").create("configDir");
        // tempDir option is used by the YARN client.
        Option tempDir = OptionBuilder.withArgName("temporary directory (overwrites configured option)").hasArg()
                .withDescription("Specify temporary directory.").create(ARG_CONF_DIR);
        configDirOpt.setRequired(true);
        tempDir.setRequired(false);
        Options options = new Options();
        options.addOption(configDirOpt);
        options.addOption(tempDir);

        CommandLineParser parser = new GnuParser();
        CommandLine line = null;
        try {
            line = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("CLI Parsing failed. Reason: " + e.getMessage());
            System.exit(STARTUP_FAILURE_RETURN_CODE);
        }

        String configDir = line.getOptionValue(configDirOpt.getOpt(), null);
        String tempDirVal = line.getOptionValue(tempDir.getOpt(), null);

        // First, try to load global configuration
        GlobalConfiguration.loadConfiguration(configDir);
        if (tempDirVal != null // the YARN TM runner has set a value for the temp dir
                // the configuration does not contain a temp directory
                && GlobalConfiguration.getString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, null) == null) {
            Configuration c = GlobalConfiguration.getConfiguration();
            c.setString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, tempDirVal);
            LOG.info("Setting temporary directory to " + tempDirVal);
            GlobalConfiguration.includeConfiguration(c);
        }

        // print some startup environment info, like user, code revision, etc
        EnvironmentInformation.logEnvironmentInfo(LOG, "TaskManager");

        // Create a new task manager object
        try {
            createTaskManager(ExecutionMode.CLUSTER);
        } catch (Throwable t) {
            LOG.error("Taskmanager startup failed: " + t.getMessage(), t);
            System.exit(STARTUP_FAILURE_RETURN_CODE);
        }

        // park the main thread to keep the JVM alive (all other threads may be daemon threads)
        Object mon = new Object();
        synchronized (mon) {
            try {
                mon.wait();
            } catch (InterruptedException ex) {
            }
        }
    }

    // --------------------------------------------------------------------------------------------
    // Miscellaneous Utilities
    // --------------------------------------------------------------------------------------------

    /**
     * Checks, whether the given strings describe existing directories that are writable. If that is not
     * the case, an exception is raised.
     * 
     * @param tempDirs An array of strings which are checked to be paths to writable directories.
     * @throws Exception Thrown, if any of the mentioned checks fails.
     */
    private static final void checkTempDirs(final String[] tempDirs) throws Exception {
        for (int i = 0; i < tempDirs.length; ++i) {
            final String dir = checkNotNull(tempDirs[i], "Temporary file directory #" + (i + 1) + " is null.");

            final File f = new File(dir);

            checkArgument(f.exists(), "Temporary file directory '" + f.getAbsolutePath() + "' does not exist.");
            checkArgument(f.isDirectory(),
                    "Temporary file directory '" + f.getAbsolutePath() + "' is not a directory.");
            checkArgument(f.canWrite(), "Temporary file directory '" + f.getAbsolutePath() + "' is not writable.");

            if (LOG.isInfoEnabled()) {
                long totalSpaceGb = f.getTotalSpace() >> 30;
                long usableSpaceGb = f.getUsableSpace() >> 30;
                double usablePercentage = ((double) usableSpaceGb) / totalSpaceGb * 100;

                LOG.info(String.format("Temporary file directory '%s': total %d GB, usable %d GB [%.2f%% usable]",
                        f.getAbsolutePath(), totalSpaceGb, usableSpaceGb, usablePercentage));
            }
        }
    }

    /**
     * Stops the given RPC protocol proxy, if it is not null.
     * This method never throws an exception, it only logs errors.
     * 
     * @param protocol The protocol proxy to stop.
     */
    private static final void stopProxy(VersionedProtocol protocol) {
        if (protocol != null) {
            try {
                RPC.stopProxy(protocol);
            } catch (Throwable t) {
                LOG.error("Error while shutting down RPC proxy.", t);
            }
        }
    }

    /**
     * Determines the IP address of the interface from which the TaskManager can connect to the given JobManager
     * IP address.
     * 
     * @param jobManagerAddress The socket address to connect to.
     * @return The IP address of the interface that connects to the JobManager.
     * @throws IOException If no connection could be established.
     */
    private static InetAddress getTaskManagerAddress(InetSocketAddress jobManagerAddress) throws IOException {
        AddressDetectionState strategy = AddressDetectionState.ADDRESS;

        while (true) {
            Enumeration<NetworkInterface> e = NetworkInterface.getNetworkInterfaces();
            while (e.hasMoreElements()) {
                NetworkInterface n = e.nextElement();
                Enumeration<InetAddress> ee = n.getInetAddresses();
                while (ee.hasMoreElements()) {
                    InetAddress i = ee.nextElement();
                    switch (strategy) {
                    case ADDRESS:
                        if (hasCommonPrefix(jobManagerAddress.getAddress().getAddress(), i.getAddress())) {
                            if (tryToConnect(i, jobManagerAddress, strategy.getTimeout())) {
                                LOG.info("Determined " + i + " as the TaskTracker's own IP address");
                                return i;
                            }
                        }
                        break;
                    case FAST_CONNECT:
                    case SLOW_CONNECT:
                        boolean correct = tryToConnect(i, jobManagerAddress, strategy.getTimeout());
                        if (correct) {
                            LOG.info("Determined " + i + " as the TaskTracker's own IP address");
                            return i;
                        }
                        break;
                    default:
                        throw new RuntimeException("Unkown address detection strategy: " + strategy);
                    }
                }
            }
            // state control
            switch (strategy) {
            case ADDRESS:
                strategy = AddressDetectionState.FAST_CONNECT;
                break;
            case FAST_CONNECT:
                strategy = AddressDetectionState.SLOW_CONNECT;
                break;
            case SLOW_CONNECT:
                throw new RuntimeException("The TaskManager is unable to connect to the JobManager (Address: '"
                        + jobManagerAddress + "').");
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Defaulting to detection strategy {}", strategy);
            }
        }
    }

    /**
     * Searches for an available free port and returns the port number.
     * 
     * @return An available port.
     * @throws RuntimeException Thrown, if no free port was found.
     */
    private static int getAvailablePort() {
        for (int i = 0; i < 50; i++) {
            ServerSocket serverSocket = null;
            try {
                serverSocket = new ServerSocket(0);
                int port = serverSocket.getLocalPort();
                if (port != 0) {
                    return port;
                }
            } catch (IOException e) {

                LOG.debug("Unable to allocate port with exception {}", e);
            } finally {
                if (serverSocket != null) {
                    try {
                        serverSocket.close();
                    } catch (Throwable t) {
                    }
                }
            }
        }

        throw new RuntimeException("Could not find a free permitted port on the machine.");
    }

    /**
     * Checks if two addresses have a common prefix (first 2 bytes).
     * Example: 192.168.???.???
     * Works also with ipv6, but accepts probably too many addresses
     */
    private static boolean hasCommonPrefix(byte[] address, byte[] address2) {
        return address[0] == address2[0] && address[1] == address2[1];
    }

    private static boolean tryToConnect(InetAddress fromAddress, SocketAddress toSocket, int timeout)
            throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trying to connect to JobManager (" + toSocket + ") from local address " + fromAddress
                    + " with timeout " + timeout);
        }
        boolean connectable = true;
        Socket socket = null;
        try {
            socket = new Socket();
            SocketAddress bindP = new InetSocketAddress(fromAddress, 0); // 0 = let the OS choose the port on this
            // machine
            socket.bind(bindP);
            socket.connect(toSocket, timeout);
        } catch (Exception ex) {
            LOG.info("Failed to connect to JobManager from address '" + fromAddress + "': " + ex.getMessage());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Failed with exception", ex);
            }
            connectable = false;
        } finally {
            if (socket != null) {
                socket.close();
            }
        }
        return connectable;
    }

    /**
     * The states of address detection mechanism.
     * There is only a state transition if the current state failed to determine the address.
     */
    private enum AddressDetectionState {
        ADDRESS(50), //detect own IP based on the JobManagers IP address. Look for common prefix
        FAST_CONNECT(50), //try to connect to the JobManager on all Interfaces and all their addresses.
        //this state uses a low timeout (say 50 ms) for fast detection.
        SLOW_CONNECT(1000); //same as FAST_CONNECT, but with a timeout of 1000 ms (1s).

        private int timeout;

        AddressDetectionState(int timeout) {
            this.timeout = timeout;
        }

        public int getTimeout() {
            return timeout;
        }
    }

    @Override
    public void killTaskManager() throws IOException {
        LOG.info("Killing TaskManager");
        System.exit(0); // returning 0 because the TM is not stopping in an error condition.
    }
}