org.apache.tajo.worker.ExecutionBlockContext.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tajo.worker.ExecutionBlockContext.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.worker;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tajo.ExecutionBlockId;
import org.apache.tajo.TajoProtos;
import org.apache.tajo.TaskAttemptId;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.engine.query.QueryContext;
import org.apache.tajo.ipc.QueryMasterProtocol;
import org.apache.tajo.master.cluster.WorkerConnectionInfo;
import org.apache.tajo.rpc.NettyClientBase;
import org.apache.tajo.rpc.NullCallback;
import org.apache.tajo.rpc.RpcConnectionPool;
import org.apache.tajo.storage.HashShuffleAppenderManager;
import org.apache.tajo.storage.StorageUtil;
import org.apache.tajo.util.NetUtils;
import org.apache.tajo.util.Pair;

import io.netty.channel.ConnectTimeoutException;
import io.netty.channel.EventLoopGroup;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import static org.apache.tajo.ipc.TajoWorkerProtocol.*;

/**
 * Worker-side context shared by the tasks of one execution block.
 *
 * <p>It holds the configuration, file systems, query engine and shared resource used by the
 * tasks, keeps track of the registered {@link Task}s and {@link TaskRunnerHistory}s, and owns a
 * {@link Reporter} thread that periodically pings the QueryMaster or sends task status updates.
 *
 * <p>Lifecycle: construct, call {@link #init()} once, and call {@link #stop()} when the
 * execution block is finished. {@link #stop()} is idempotent.
 */
public class ExecutionBlockContext {
    /** class logger */
    private static final Log LOG = LogFactory.getLog(ExecutionBlockContext.class);

    private final TaskRunnerManager manager;

    // Task counters. They are public and updated from outside of this class.
    public AtomicInteger completedTasksNum = new AtomicInteger();
    public AtomicInteger succeededTasksNum = new AtomicInteger();
    public AtomicInteger killedTasksNum = new AtomicInteger();
    public AtomicInteger failedTasksNum = new AtomicInteger();

    // NOTE(review): never assigned or read in this class.
    private EventLoopGroup loopGroup;
    // for temporal or intermediate files
    private final FileSystem localFS;
    // for input files
    private final FileSystem defaultFS;
    private final ExecutionBlockId executionBlockId;
    private final QueryContext queryContext;
    private final TajoWorker.WorkerContext workerContext;
    private final String plan;

    private final ExecutionBlockSharedResource resource;

    private final TajoQueryEngine queryEngine;
    private final RpcConnectionPool connPool;
    // resolved in init() from the QueryMaster connection info
    private InetSocketAddress qmMasterAddr;
    private final WorkerConnectionInfo queryMaster;
    private final TajoConf systemConf;
    // for the doAs block; created in init()
    private UserGroupInformation taskOwner;

    private final Reporter reporter;

    // set once by stop(); guards against running the shutdown sequence twice
    private final AtomicBoolean stop = new AtomicBoolean();

    // It keeps all of the query unit attempts while a TaskRunner is running.
    private final ConcurrentMap<TaskAttemptId, Task> tasks = Maps.newConcurrentMap();

    // task runner id -> history of that task runner
    private final ConcurrentMap<String, TaskRunnerHistory> histories = Maps.newConcurrentMap();

    /**
     * Creates the context. No thread is started and no connection to the QueryMaster is opened
     * until {@link #init()} is called.
     *
     * @param conf             system configuration; also used to obtain the default and local file systems
     * @param workerContext    context of the hosting worker
     * @param manager          manager used to look up and stop task runners
     * @param queryContext     query context handed to the shared resource on initialization
     * @param plan             plan handed to the shared resource on initialization
     * @param executionBlockId id of the execution block this context belongs to
     * @param queryMaster      connection info of the QueryMaster to report to
     * @throws Throwable if a file system cannot be obtained or a collaborator fails to construct
     */
    public ExecutionBlockContext(TajoConf conf, TajoWorker.WorkerContext workerContext, TaskRunnerManager manager,
            QueryContext queryContext, String plan, ExecutionBlockId executionBlockId,
            WorkerConnectionInfo queryMaster) throws Throwable {
        this.manager = manager;
        this.executionBlockId = executionBlockId;
        this.connPool = RpcConnectionPool.getPool();
        this.queryMaster = queryMaster;
        this.systemConf = conf;
        this.reporter = new Reporter();
        this.defaultFS = TajoConf.getTajoRootDir(systemConf).getFileSystem(systemConf);
        this.localFS = FileSystem.getLocal(systemConf);

        // Setup QueryEngine according to the query plan
        // Here, we can setup row-based query engine or columnar query engine.
        this.queryEngine = new TajoQueryEngine(systemConf);
        this.queryContext = queryContext;
        this.plan = plan;
        this.resource = new ExecutionBlockSharedResource();
        this.workerContext = workerContext;
    }

    /**
     * Resolves the QueryMaster address, creates the task owner, starts the reporter thread and
     * initializes the shared resource.
     *
     * <p>If the shared resource fails to initialize, the reporter is stopped again, the
     * QueryMaster is asked (best effort) to kill the query, and the original failure is rethrown.
     *
     * @throws Throwable the failure raised while initializing
     */
    public void init() throws Throwable {

        LOG.info("Tajo Root Dir: " + systemConf.getVar(TajoConf.ConfVars.ROOT_DIR));
        LOG.info("Worker Local Dir: " + systemConf.getVar(TajoConf.ConfVars.WORKER_TEMPORAL_DIR));

        this.qmMasterAddr = NetUtils.createSocketAddr(queryMaster.getHost(), queryMaster.getQueryMasterPort());
        LOG.info("QueryMaster Address:" + qmMasterAddr);

        UserGroupInformation.setConfiguration(systemConf);
        // TODO - 'load credential' should be implemented
        // Getting taskOwner
        this.taskOwner = UserGroupInformation.createRemoteUser(systemConf.getVar(TajoConf.ConfVars.USERNAME));

        this.reporter.startReporter();

        // resource initialization
        try {
            this.resource.initialize(queryContext, plan);
        } catch (Throwable e) {
            // The reporter is already running; without this it would keep pinging the
            // QueryMaster for a context whose initialization failed.
            try {
                reporter.stop();
            } catch (Throwable stopFailure) {
                LOG.warn("Failed to stop the reporter after a failed initialization", stopFailure);
            }
            killQueryQuietly();
            throw e;
        }
    }

    /**
     * Asks the QueryMaster to kill the query of this execution block. Best effort: a failure is
     * logged and never propagated, so that it cannot mask the error that triggered the kill.
     */
    private void killQueryQuietly() {
        try {
            NettyClientBase client = getQueryMasterConnection();
            try {
                QueryMasterProtocol.QueryMasterProtocolService.Interface stub = client.getStub();
                stub.killQuery(null, executionBlockId.getQueryId().getProto(), NullCallback.get());
            } finally {
                connPool.releaseConnection(client);
            }
        } catch (Throwable t) {
            LOG.warn("Failed to ask the QueryMaster to kill query " + executionBlockId.getQueryId(), t);
        }
    }

    /** @return the resource shared by the tasks of this execution block */
    public ExecutionBlockSharedResource getSharedResource() {
        return resource;
    }

    /**
     * Obtains a connection to the QueryMaster from the connection pool. The address is resolved
     * in {@link #init()}. Callers must hand the connection back through
     * {@link #releaseConnection(NettyClientBase)}.
     */
    public NettyClientBase getQueryMasterConnection()
            throws NoSuchMethodException, ConnectTimeoutException, ClassNotFoundException {
        return connPool.getConnection(qmMasterAddr, QueryMasterProtocol.class, true);
    }

    /** Hands a connection obtained from {@link #getQueryMasterConnection()} back to the pool. */
    public void releaseConnection(NettyClientBase connection) {
        connPool.releaseConnection(connection);
    }

    /**
     * Stops the reporter, marks every pending or running task as failed and aborts it, forgets
     * all tasks and releases the shared resource. Only the first call has an effect.
     */
    public void stop() {
        if (stop.getAndSet(true)) {
            return;
        }

        try {
            reporter.stop();
        } catch (InterruptedException e) {
            LOG.error(e.getMessage(), e);
            // keep the interruption visible to the caller
            Thread.currentThread().interrupt();
        }

        // If ExecutionBlock is stopped, all running or pending tasks will be marked as failed.
        for (Task task : tasks.values()) {
            if (task.getStatus() == TajoProtos.TaskAttemptState.TA_PENDING
                    || task.getStatus() == TajoProtos.TaskAttemptState.TA_RUNNING) {
                task.setState(TajoProtos.TaskAttemptState.TA_FAILED);
                try {
                    task.abort();
                } catch (Throwable e) {
                    // one task failing to abort must not prevent the others from being aborted
                    LOG.error(e, e);
                }
            }
        }
        tasks.clear();

        resource.release();
    }

    /** @return the system configuration passed to the constructor */
    public TajoConf getConf() {
        return systemConf;
    }

    /** @return the local file system (temporal or intermediate files) */
    public FileSystem getLocalFS() {
        return localFS;
    }

    /** @return the file system of the Tajo root directory (input files) */
    public FileSystem getDefaultFS() {
        return defaultFS;
    }

    /** @return the local directory allocator of the hosting worker */
    public LocalDirAllocator getLocalDirAllocator() {
        return workerContext.getLocalDirAllocator();
    }

    /** @return the query engine created for this execution block */
    public TajoQueryEngine getTQueryEngine() {
        return queryEngine;
    }

    /**
     * Picks a local path for the base output directory of this execution block and qualifies it
     * against the local file system.
     *
     * @return the qualified local path
     * @throws IOException if the allocator cannot provide a path for writing
     */
    // for the local temporal dir
    public Path createBaseDir() throws IOException {
        // the base dir for an output dir
        String baseDir = getBaseOutputDir(executionBlockId).toString();
        return localFS.makeQualified(getLocalDirAllocator().getLocalPathForWrite(baseDir, systemConf));
    }

    /**
     * @return the relative path {@code queryId/output/N}, where {@code N} is
     *         {@code executionBlockId.getId()}
     */
    public static Path getBaseOutputDir(ExecutionBlockId executionBlockId) {
        return StorageUtil.concatPath(executionBlockId.getQueryId().toString(), "output",
                String.valueOf(executionBlockId.getId()));
    }

    /**
     * @return the relative path {@code queryId/in/EB}, where {@code EB} is
     *         {@code executionBlockId.toString()}
     */
    public static Path getBaseInputDir(ExecutionBlockId executionBlockId) {
        return StorageUtil.concatPath(executionBlockId.getQueryId().toString(), "in",
                executionBlockId.toString());
    }

    /** @return the id of the execution block this context belongs to */
    public ExecutionBlockId getExecutionBlockId() {
        return executionBlockId;
    }

    /** @return the live (not copied) map of the tasks registered with this context */
    public Map<TaskAttemptId, Task> getTasks() {
        return tasks;
    }

    /** @return the task registered under the given attempt id, or {@code null} if there is none */
    public Task getTask(TaskAttemptId taskAttemptId) {
        return tasks.get(taskAttemptId);
    }

    /** Delegates to the task runner manager to stop the task runner with the given id. */
    public void stopTaskRunner(String id) {
        manager.stopTaskRunner(id);
    }

    /** Delegates to the task runner manager to look up the task runner with the given id. */
    public TaskRunner getTaskRunner(String taskRunnerId) {
        return manager.getTaskRunner(taskRunnerId);
    }

    /**
     * Records the history of a task attempt under its task runner. If no history has been
     * created for the task runner (see {@link #createTaskRunnerHistory(TaskRunner)}), the entry
     * is dropped with a warning instead of failing with a {@link NullPointerException}.
     */
    public void addTaskHistory(String taskRunnerId, TaskAttemptId quAttemptId, TaskHistory taskHistory) {
        TaskRunnerHistory runnerHistory = histories.get(taskRunnerId);
        if (runnerHistory == null) {
            LOG.warn("No TaskRunnerHistory is registered for task runner " + taskRunnerId
                    + "; dropping the history of " + quAttemptId);
            return;
        }
        runnerHistory.addTaskHistory(quAttemptId, taskHistory);
    }

    /**
     * Returns the history registered for the given task runner, creating and registering a new
     * one if none exists yet.
     */
    public TaskRunnerHistory createTaskRunnerHistory(TaskRunner runner) {
        TaskRunnerHistory created = new TaskRunnerHistory(runner.getContainerId(), executionBlockId);
        // putIfAbsent returns the previously registered value, or null if ours was stored
        TaskRunnerHistory existing = histories.putIfAbsent(runner.getId(), created);
        return existing != null ? existing : created;
    }

    /** @return the context of the hosting worker */
    public TajoWorker.WorkerContext getWorkerContext() {
        return workerContext;
    }

    /** Sends the given report to the QueryMaster and returns the connection to the pool. */
    private void sendExecutionBlockReport(ExecutionBlockReport reporter) throws Exception {
        NettyClientBase client = getQueryMasterConnection();
        try {
            QueryMasterProtocol.QueryMasterProtocolService.Interface stub = client.getStub();
            stub.doneExecutionBlock(null, reporter, NullCallback.get());
        } finally {
            connPool.releaseConnection(client);
        }
    }

    /**
     * Closes the hash-shuffle appenders of the given execution block and reports the resulting
     * intermediate entries to the QueryMaster.
     *
     * <p>A failure while closing the appenders or building the entries is reported to the
     * QueryMaster as an unsuccessful report carrying the error message. The report is sent
     * exactly once, outside of that error handling, so that a transport failure is not
     * mistaken for a shuffle failure.
     *
     * @param ebId id of the execution block to report
     * @throws RuntimeException if the report cannot be sent to the QueryMaster
     */
    protected void reportExecutionBlock(ExecutionBlockId ebId) {
        ExecutionBlockReport.Builder reporterBuilder = ExecutionBlockReport.newBuilder();
        reporterBuilder.setEbId(ebId.getProto());
        reporterBuilder.setReportSuccess(true);
        reporterBuilder.setSucceededTasks(succeededTasksNum.get());
        try {
            List<HashShuffleAppenderManager.HashShuffleIntermediate> shuffles = getWorkerContext()
                    .getHashShuffleAppenderManager().close(ebId);

            // send intermediateEntries to QueryMaster
            reporterBuilder.addAllIntermediateEntries(buildIntermediateEntries(ebId, shuffles));

        } catch (Throwable e) {
            LOG.error(e.getMessage(), e);
            reporterBuilder.setReportSuccess(false);
            if (e.getMessage() == null) {
                reporterBuilder.setReportErrorMessage(e.getClass().getSimpleName());
            } else {
                reporterBuilder.setReportErrorMessage(e.getMessage());
            }
        }
        try {
            sendExecutionBlockReport(reporterBuilder.build());
        } catch (Throwable e) {
            // can't send report to query master
            LOG.fatal(e.getMessage(), e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Converts the closed hash-shuffle intermediates into report entries.
     *
     * @param ebId     id of the execution block the entries belong to
     * @param shuffles intermediates returned when closing the appenders; may be {@code null}
     * @return one entry per intermediate; empty if {@code shuffles} is {@code null}
     */
    private List<IntermediateEntryProto> buildIntermediateEntries(ExecutionBlockId ebId,
            List<HashShuffleAppenderManager.HashShuffleIntermediate> shuffles) {
        List<IntermediateEntryProto> intermediateEntries = Lists.newArrayList();
        if (shuffles == null) {
            return intermediateEntries;
        }

        // the builders are reused; each is cleared before building the next message
        IntermediateEntryProto.Builder intermediateBuilder = IntermediateEntryProto.newBuilder();
        IntermediateEntryProto.PageProto.Builder pageBuilder = IntermediateEntryProto.PageProto.newBuilder();
        FailureIntermediateProto.Builder failureBuilder = FailureIntermediateProto.newBuilder();

        for (HashShuffleAppenderManager.HashShuffleIntermediate eachShuffle : shuffles) {
            List<IntermediateEntryProto.PageProto> pages = Lists.newArrayList();
            List<FailureIntermediateProto> failureIntermediateItems = Lists.newArrayList();

            for (Pair<Long, Integer> eachPage : eachShuffle.getPages()) {
                pageBuilder.clear();
                pageBuilder.setPos(eachPage.getFirst());
                pageBuilder.setLength(eachPage.getSecond());
                pages.add(pageBuilder.build());
            }

            for (Pair<Long, Pair<Integer, Integer>> eachFailure : eachShuffle.getFailureTskTupleIndexes()) {
                failureBuilder.clear();
                failureBuilder.setPagePos(eachFailure.getFirst());
                failureBuilder.setStartRowNum(eachFailure.getSecond().getFirst());
                failureBuilder.setEndRowNum(eachFailure.getSecond().getSecond());
                failureIntermediateItems.add(failureBuilder.build());
            }
            intermediateBuilder.clear();

            intermediateBuilder.setEbId(ebId.getProto())
                    .setHost(getWorkerContext().getConnectionInfo().getHost() + ":"
                            + getWorkerContext().getConnectionInfo().getPullServerPort())
                    .setTaskId(-1).setAttemptId(-1).setPartId(eachShuffle.getPartId())
                    .setVolume(eachShuffle.getVolume()).addAllPages(pages)
                    .addAllFailures(failureIntermediateItems);
            intermediateEntries.add(intermediateBuilder.build());
        }
        return intermediateEntries;
    }

    /**
     * Background reporter of this execution block. While running, it repeatedly either pings
     * the QueryMaster (when no task is registered) or sends a status update for every running
     * task whose progress changed, waiting up to {@link #PROGRESS_INTERVAL} ms between rounds.
     */
    protected class Reporter {
        private Thread reporterThread;
        private AtomicBoolean reporterStop = new AtomicBoolean();
        // maximum wait between two reporting rounds, in milliseconds
        private static final int PROGRESS_INTERVAL = 1000;
        // number of failed rounds after which the reporter thread gives up
        private static final int MAX_RETRIES = 10;

        /** Creates the reporter thread without starting it. */
        public Reporter() {
            this.reporterThread = new Thread(createReporterThread());
            this.reporterThread.setName("Task reporter");
        }

        /** Starts the reporter thread. */
        public void startReporter() {
            this.reporterThread.start();
        }

        /** Builds the body of the reporter thread. */
        Runnable createReporterThread() {

            return new Runnable() {
                // NOTE(review): this budget is never reset after a successful round, so it
                // counts failures over the reporter's lifetime, not consecutive failures.
                int remainingRetries = MAX_RETRIES;

                @Override
                public void run() {
                    while (!reporterStop.get() && !Thread.interrupted()) {

                        NettyClientBase client = null;
                        try {
                            client = getQueryMasterConnection();
                            QueryMasterProtocol.QueryMasterProtocolService.Interface masterStub = client.getStub();

                            if (tasks.size() == 0) {
                                masterStub.ping(null, getExecutionBlockId().getProto(), NullCallback.get());
                            } else {
                                // iterate over a snapshot; tasks may be registered concurrently
                                for (Task task : new ArrayList<Task>(tasks.values())) {

                                    if (task.isRunning() && task.isProgressChanged()) {
                                        task.updateProgress();
                                        masterStub.statusUpdate(null, task.getReport(), NullCallback.get());
                                        task.getContext().setProgressChanged(false);
                                    } else {
                                        task.updateProgress();
                                    }
                                }
                            }
                        } catch (Throwable t) {
                            LOG.error(t.getMessage(), t);
                            remainingRetries -= 1;
                            if (remainingRetries == 0) {
                                ReflectionUtils.logThreadInfo(LOG, "Communication exception", 0);
                                LOG.warn("Last retry, exiting ");
                                throw new RuntimeException(t);
                            }
                        } finally {
                            // client is still null when acquiring the connection itself failed
                            if (client != null) {
                                releaseConnection(client);
                            }
                            if (remainingRetries > 0 && !reporterStop.get()) {
                                synchronized (reporterThread) {
                                    try {
                                        reporterThread.wait(PROGRESS_INTERVAL);
                                    } catch (InterruptedException e) {
                                        // wait() cleared the interrupt flag; restore it so that
                                        // the loop condition above observes it and exits
                                        Thread.currentThread().interrupt();
                                    }
                                }
                            }
                        }
                    }
                }
            };
        }

        /**
         * Requests the reporter thread to finish. Only the first call has an effect. The
         * method does not wait for the thread to terminate.
         */
        public void stop() throws InterruptedException {
            if (reporterStop.getAndSet(true)) {
                return;
            }

            if (reporterThread != null) {
                // The reporter is woken up instead of interrupted, so that an in-flight ping
                // or statusUpdate is not cut short.
                synchronized (reporterThread) {
                    // Wake the reporter if it is waiting between two rounds; otherwise it sees
                    // the stop flag once the current round returns.
                    reporterThread.notifyAll();
                }
            }
        }
    }
}