org.apache.tez.dag.history.logging.ats.ATSV15HistoryLoggingService.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tez.dag.history.logging.ats.ATSV15HistoryLoggingService.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.dag.history.logging.ats;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.hadoop.yarn.api.records.timeline.TimelineEntityGroupId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.timeline.TimelineEntity;
import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse;
import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse.TimelinePutError;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.tez.common.ReflectionUtils;
import org.apache.tez.common.TezUtilsInternal;
import org.apache.tez.common.security.HistoryACLPolicyManager;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezConstants;
import org.apache.tez.dag.api.TezReflectionException;
import org.apache.tez.dag.history.DAGHistoryEvent;
import org.apache.tez.dag.history.HistoryEventType;
import org.apache.tez.dag.history.events.DAGSubmittedEvent;
import org.apache.tez.dag.history.logging.HistoryLoggingService;
import org.apache.tez.dag.history.events.DAGRecoveredEvent;
import org.apache.tez.dag.history.logging.EntityTypes;
import org.apache.tez.dag.records.TezDAGID;

import com.google.common.annotations.VisibleForTesting;

/**
 * History logging service that publishes Tez history events to the YARN
 * Timeline Service using the entity-group aware {@code putEntities} call of
 * {@link TimelineClient}.
 *
 * <p>{@link #handle(DAGHistoryEvent)} only enqueues events. A single background
 * thread, started by {@link #serviceStart()}, takes events off the queue,
 * drops the ones belonging to DAGs that must not be logged, converts the rest
 * to {@link TimelineEntity} objects and posts them. {@link #serviceStop()}
 * drains whatever is still queued, bounded by the configured flush timeout.
 *
 * <p>{@code skippedDAGs} and {@code dagDomainIdMap} are only read or written
 * from {@code isValidEvent}/{@code handleEvents}, which are always invoked
 * while {@code lock} is held.
 */
public class ATSV15HistoryLoggingService extends HistoryLoggingService {

    private static final Logger LOG = LoggerFactory.getLogger(ATSV15HistoryLoggingService.class);

    /** Events accepted by {@link #handle(DAGHistoryEvent)} that have not been published yet. */
    private final LinkedBlockingQueue<DAGHistoryEvent> eventQueue = new LinkedBlockingQueue<DAGHistoryEvent>();

    /** Background thread that publishes queued events; null until {@link #serviceStart()} creates it. */
    private Thread eventHandlingThread;
    /** Set by {@link #serviceStop()} to make the background thread leave its loop. */
    private final AtomicBoolean stopped = new AtomicBoolean(false);
    /** Loop iterations since the queue statistics were last considered for logging. */
    private int eventCounter = 0;
    /** Events successfully handed to {@code handleEvents} since the statistics were last reset. */
    private int eventsProcessed = 0;
    /** Held by whichever thread is currently polling and publishing events. */
    private final Object lock = new Object();
    private boolean historyLoggingEnabled = true;

    @VisibleForTesting
    TimelineClient timelineClient;

    /** DAGs whose events are dropped (pre-warm DAGs and DAGs with history logging disabled). */
    private final HashSet<TezDAGID> skippedDAGs = new HashSet<TezDAGID>();
    /**
     * Per-DAG timeline domain ids captured from DAG_SUBMITTED events.
     * NOTE(review): entries are never removed in this class, so the map grows
     * with the number of DAGs that carry a DAG-level domain id.
     */
    private final Map<TezDAGID, String> dagDomainIdMap = new HashMap<TezDAGID, String>();
    /** Maximum time in milliseconds spent flushing the queue on stop; negative means no limit. */
    private long maxTimeToWaitOnShutdown;
    private boolean waitForeverOnShutdown = false;

    /** Timeout in milliseconds of each individual poll on the event queue. */
    private long maxPollingTimeMillis;

    /** Timeline domain id applied to events that have no DAG-specific domain. */
    private String sessionDomainId;
    private static final String atsHistoryLoggingServiceClassName = ATSV15HistoryLoggingService.class.getName();
    private static final String atsHistoryACLManagerClassName = "org.apache.tez.dag.history.ats.acls.ATSV15HistoryACLPolicyManager";
    /** Null when the ACL manager class could not be instantiated and that is allowed by configuration. */
    private HistoryACLPolicyManager historyACLPolicyManager;

    public ATSV15HistoryLoggingService() {
        super(ATSV15HistoryLoggingService.class.getName());
    }

    /**
     * Reads the service configuration, creates and initialises (but does not
     * start) the timeline client when the Timeline Service is enabled, and
     * instantiates the history ACL policy manager reflectively.
     *
     * @param serviceConf configuration to initialise from; it is copied, not modified
     * @throws Exception in particular a {@link TezReflectionException} when the ACL
     *         policy manager cannot be instantiated and
     *         {@code TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS} is not set to true
     */
    @Override
    public void serviceInit(Configuration serviceConf) throws Exception {
        // Work on a copy so the summary-type override below stays local to this service.
        Configuration conf = new Configuration(serviceConf);

        String summaryEntityTypesStr = EntityTypes.TEZ_APPLICATION + "," + EntityTypes.TEZ_APPLICATION_ATTEMPT + ","
                + EntityTypes.TEZ_DAG_ID;

        // Ensure that summary entity types are defined properly for Tez.
        if (conf.getBoolean(TezConfiguration.TEZ_AM_ATS_V15_OVERRIDE_SUMMARY_TYPES,
                TezConfiguration.TEZ_AM_ATS_V15_OVERRIDE_SUMMARY_TYPES_DEFAULT)) {
            conf.set(YarnConfiguration.TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_SUMMARY_ENTITY_TYPES,
                    summaryEntityTypesStr);
        }

        historyLoggingEnabled = conf.getBoolean(TezConfiguration.TEZ_AM_HISTORY_LOGGING_ENABLED,
                TezConfiguration.TEZ_AM_HISTORY_LOGGING_ENABLED_DEFAULT);
        if (!historyLoggingEnabled) {
            LOG.info("ATSService: History Logging disabled. " + TezConfiguration.TEZ_AM_HISTORY_LOGGING_ENABLED
                    + " set to false");
            return;
        }

        if (conf.getBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED,
                YarnConfiguration.DEFAULT_TIMELINE_SERVICE_ENABLED)) {
            timelineClient = TimelineClient.createTimelineClient();
            timelineClient.init(conf);
        } else {
            this.timelineClient = null;
            // Only warn when this service was explicitly selected as the logging service.
            if (conf.get(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, "")
                    .equals(atsHistoryLoggingServiceClassName)) {
                LOG.warn(atsHistoryLoggingServiceClassName + " is disabled due to Timeline Service being disabled, "
                        + YarnConfiguration.TIMELINE_SERVICE_ENABLED + " set to false");
            }
        }
        maxTimeToWaitOnShutdown = conf.getLong(TezConfiguration.YARN_ATS_EVENT_FLUSH_TIMEOUT_MILLIS,
                TezConfiguration.YARN_ATS_EVENT_FLUSH_TIMEOUT_MILLIS_DEFAULT);
        maxPollingTimeMillis = conf.getInt(TezConfiguration.YARN_ATS_MAX_POLLING_TIME_PER_EVENT,
                TezConfiguration.YARN_ATS_MAX_POLLING_TIME_PER_EVENT_DEFAULT);
        if (maxTimeToWaitOnShutdown < 0) {
            // A negative timeout means: keep flushing until the queue is empty.
            waitForeverOnShutdown = true;
        }
        sessionDomainId = conf.get(TezConfiguration.YARN_ATS_ACL_SESSION_DOMAIN_ID);

        LOG.info("Initializing " + ATSV15HistoryLoggingService.class.getSimpleName() + " with "
                + "maxPollingTime(ms)=" + maxPollingTimeMillis + ", waitTimeForShutdown(ms)="
                + maxTimeToWaitOnShutdown + ", TimelineACLManagerClass=" + atsHistoryACLManagerClassName);

        try {
            historyACLPolicyManager = ReflectionUtils.createClazzInstance(atsHistoryACLManagerClassName);
            historyACLPolicyManager.setConf(conf);
        } catch (TezReflectionException e) {
            LOG.warn("Could not instantiate object for " + atsHistoryACLManagerClassName
                    + ". ACLs cannot be enforced correctly for history data in Timeline", e);
            if (!conf.getBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS,
                    TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS_DEFAULT)) {
                throw e;
            }
            // Running without timeline domains was explicitly allowed.
            historyACLPolicyManager = null;
        }

    }

    /**
     * Starts the timeline client and the background thread that publishes
     * queued events. Does nothing when history logging is disabled or no
     * timeline client was created.
     */
    @Override
    public void serviceStart() {
        if (!historyLoggingEnabled || timelineClient == null) {
            return;
        }
        timelineClient.start();

        eventHandlingThread = new Thread(new Runnable() {
            @Override
            public void run() {
                boolean interrupted = false;
                TezUtilsInternal.setHadoopCallerContext(appContext.getHadoopShim(), appContext.getApplicationID());
                while (!stopped.get() && !Thread.currentThread().isInterrupted() && !interrupted) {

                    // Log the size of the event-queue every so often.
                    if (eventCounter != 0 && eventCounter % 1000 == 0) {
                        if (eventsProcessed != 0 && !eventQueue.isEmpty()) {
                            LOG.info("Event queue stats" + ", eventsProcessedSinceLastUpdate=" + eventsProcessed
                                    + ", eventQueueSize=" + eventQueue.size());
                        }
                        eventCounter = 0;
                        eventsProcessed = 0;
                    } else {
                        ++eventCounter;
                    }

                    // The lock is held across the poll so that serviceStop() cannot start
                    // draining the queue while this thread is in the middle of an event.
                    synchronized (lock) {
                        try {
                            DAGHistoryEvent event = eventQueue.poll(maxPollingTimeMillis, TimeUnit.MILLISECONDS);
                            if (event == null) {
                                continue;
                            }
                            if (!isValidEvent(event)) {
                                continue;
                            }

                            try {
                                handleEvents(event);
                                eventsProcessed += 1;
                            } catch (Exception e) {
                                // A failure on one event must not kill the publishing thread.
                                LOG.warn("Error handling events", e);
                            }
                        } catch (InterruptedException e) {
                            // Leave the loop; serviceStop() flushes the remaining events.
                            interrupted = true;
                        }
                    }
                }
            }
        }, "HistoryEventHandlingThread");
        eventHandlingThread.start();
    }

    /**
     * Stops the background thread, flushes events that are still queued
     * (bounded by the configured flush timeout unless it is negative), then
     * stops the timeline client and closes the ACL policy manager.
     *
     * <p>If the calling thread is interrupted while flushing, flushing is
     * abandoned, the remaining cleanup still runs, and the interrupt status is
     * restored before this method returns.
     */
    @Override
    public void serviceStop() {
        if (!historyLoggingEnabled) {
            // serviceInit() returned before creating anything that needs releasing.
            return;
        }
        if (timelineClient == null) {
            // Nothing was started, but serviceInit() may still have created the ACL manager.
            closeHistoryACLPolicyManager();
            return;
        }
        LOG.info("Stopping ATSService" + ", eventQueueBacklog=" + eventQueue.size());
        stopped.set(true);
        if (eventHandlingThread != null) {
            eventHandlingThread.interrupt();
        }
        boolean interruptedWhileFlushing = false;
        try {
            TezUtilsInternal.setHadoopCallerContext(appContext.getHadoopShim(), appContext.getApplicationID());
            // Acquiring the lock waits for the background thread to finish its current event.
            synchronized (lock) {
                if (!eventQueue.isEmpty()) {
                    interruptedWhileFlushing = flushPendingEvents();
                }
            }
        } finally {
            appContext.getHadoopShim().clearHadoopCallerContext();
        }
        try {
            if (!eventQueue.isEmpty()) {
                LOG.warn("Did not finish flushing eventQueue before stopping ATSService" + ", eventQueueBacklog="
                        + eventQueue.size());
            }
            timelineClient.stop();
            closeHistoryACLPolicyManager();
        } finally {
            if (interruptedWhileFlushing) {
                // Restored only after cleanup so the interrupt does not disturb the calls above.
                Thread.currentThread().interrupt();
            }
        }

    }

    /**
     * Publishes queued events until the queue runs dry, the flush deadline
     * passes, or the calling thread is interrupted. Must be called with
     * {@code lock} held.
     *
     * @return true if flushing was cut short by an interrupt, false otherwise
     */
    private boolean flushPendingEvents() {
        LOG.warn("ATSService being stopped" + ", eventQueueBacklog=" + eventQueue.size()
                + ", maxTimeLeftToFlush=" + maxTimeToWaitOnShutdown + ", waitForever="
                + waitForeverOnShutdown);
        long startTime = appContext.getClock().getTime();
        long endTime = startTime + maxTimeToWaitOnShutdown;
        while (waitForeverOnShutdown || (endTime >= appContext.getClock().getTime())) {
            DAGHistoryEvent event;
            try {
                event = eventQueue.poll(maxPollingTimeMillis, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                LOG.info("ATSService interrupted while shutting down. Exiting." + " EventQueueBacklog="
                        + eventQueue.size());
                return true;
            }
            if (event == null) {
                LOG.info("Event queue empty, stopping ATS Service");
                return false;
            }
            if (!isValidEvent(event)) {
                continue;
            }
            try {
                handleEvents(event);
            } catch (Exception e) {
                LOG.warn("Error handling event", e);
            }
        }
        return false;
    }

    /** Closes the ACL policy manager if one was created. */
    private void closeHistoryACLPolicyManager() {
        if (historyACLPolicyManager != null) {
            historyACLPolicyManager.close();
        }
    }

    /**
     * Determines the timeline entity group an event is written to.
     *
     * @param event the history event being published
     * @return a group keyed by the DAG id for DAG, vertex, task and task-attempt
     *         events; a group keyed by the application id for application, AM and
     *         container events; {@code null} for any other event type
     */
    @VisibleForTesting
    public TimelineEntityGroupId getGroupId(DAGHistoryEvent event) {
        // Changing this function will impact TimelineCachePluginImpl and should be done very
        // carefully to account for handling different versions of Tez
        switch (event.getHistoryEvent().getEventType()) {
        case DAG_SUBMITTED:
        case DAG_INITIALIZED:
        case DAG_STARTED:
        case DAG_FINISHED:
        case DAG_KILL_REQUEST:
        case VERTEX_INITIALIZED:
        case VERTEX_STARTED:
        case VERTEX_CONFIGURE_DONE:
        case VERTEX_FINISHED:
        case TASK_STARTED:
        case TASK_FINISHED:
        case TASK_ATTEMPT_STARTED:
        case TASK_ATTEMPT_FINISHED:
        case DAG_COMMIT_STARTED:
        case VERTEX_COMMIT_STARTED:
        case VERTEX_GROUP_COMMIT_STARTED:
        case VERTEX_GROUP_COMMIT_FINISHED:
        case DAG_RECOVERED:
            return TimelineEntityGroupId.newInstance(event.getDagID().getApplicationId(),
                    event.getDagID().toString());
        case APP_LAUNCHED:
        case AM_LAUNCHED:
        case AM_STARTED:
        case CONTAINER_LAUNCHED:
        case CONTAINER_STOPPED:
            return TimelineEntityGroupId.newInstance(appContext.getApplicationID(),
                    appContext.getApplicationID().toString());
        default:
            // Event types not listed above are not assigned to any group.
            return null;
        }
    }

    /**
     * Queues an event for asynchronous publishing. The event is silently
     * dropped when history logging is disabled or there is no timeline client.
     *
     * @param event the history event to publish
     */
    public void handle(DAGHistoryEvent event) {
        if (historyLoggingEnabled && timelineClient != null) {
            eventQueue.add(event);
        }
    }

    /**
     * Decides whether an event should be published and maintains the per-DAG
     * bookkeeping ({@code skippedDAGs}, {@code dagDomainIdMap}) as a side effect.
     *
     * @param event the event taken off the queue
     * @return false if the event belongs to a DAG that is not being logged, true otherwise
     */
    private boolean isValidEvent(DAGHistoryEvent event) {
        HistoryEventType eventType = event.getHistoryEvent().getEventType();
        TezDAGID dagId = event.getDagID();

        if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
            DAGSubmittedEvent dagSubmittedEvent = (DAGSubmittedEvent) event.getHistoryEvent();
            String dagName = dagSubmittedEvent.getDAGName();
            if ((dagName != null && dagName.startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX))
                    || (!dagSubmittedEvent.isHistoryLoggingEnabled())) {
                // Skip pre-warm DAGs and DAGs submitted with history logging turned off;
                // remember the DAG so its later events are dropped as well.
                skippedDAGs.add(dagId);
                return false;
            }
            if (historyACLPolicyManager != null) {
                // Remember the DAG-level domain so handleEvents() can apply it to this DAG's events.
                String dagDomainId = dagSubmittedEvent.getConf().get(TezConfiguration.YARN_ATS_ACL_DAG_DOMAIN_ID);
                if (dagDomainId != null) {
                    dagDomainIdMap.put(dagId, dagDomainId);
                }
            }
        }
        if (eventType.equals(HistoryEventType.DAG_RECOVERED)) {
            DAGRecoveredEvent dagRecoveredEvent = (DAGRecoveredEvent) event.getHistoryEvent();
            if (!dagRecoveredEvent.isHistoryLoggingEnabled()) {
                skippedDAGs.add(dagRecoveredEvent.getDagID());
                return false;
            }
        }
        if (eventType.equals(HistoryEventType.DAG_FINISHED)) {
            // Remove from set to keep size small
            // No more events should be seen after this point.
            if (skippedDAGs.remove(dagId)) {
                return false;
            }
        }

        if (dagId != null && skippedDAGs.contains(dagId)) {
            // Event of a DAG that was marked as skipped earlier.
            return false;
        }

        return true;
    }

    /**
     * Converts an event to a timeline entity, tags it with the applicable
     * timeline domain and posts it through the timeline client. Failures to
     * post are logged and not propagated.
     *
     * @param event an event already accepted by {@code isValidEvent}
     */
    private void handleEvents(DAGHistoryEvent event) {
        String domainId = sessionDomainId;
        TezDAGID dagId = event.getDagID();

        if (historyACLPolicyManager != null && dagId != null) {
            // Only non-null ids are ever stored, so a null lookup means "no DAG-level domain".
            String dagDomainId = dagDomainIdMap.get(dagId);
            if (dagDomainId != null) {
                domainId = dagDomainId;
            }
        }

        TimelineEntity entity = HistoryEventTimelineConversion.convertToTimelineEntity(event.getHistoryEvent());

        if (historyACLPolicyManager != null) {
            // DAG-specific events use the DAG's domain (falling back to the session domain);
            // all other events always use the session domain.
            if (HistoryEventType.isDAGSpecificEvent(event.getHistoryEvent().getEventType())) {
                if (domainId != null && !domainId.isEmpty()) {
                    historyACLPolicyManager.updateTimelineEntityDomain(entity, domainId);
                }
            } else {
                if (sessionDomainId != null && !sessionDomainId.isEmpty()) {
                    historyACLPolicyManager.updateTimelineEntityDomain(entity, sessionDomainId);
                }
            }
        }

        try {
            TimelineEntityGroupId groupId = getGroupId(event);
            TimelinePutResponse response = timelineClient.putEntities(appContext.getApplicationAttemptId(), groupId,
                    entity);
            if (response != null && !response.getErrors().isEmpty()) {
                for (TimelinePutError err : response.getErrors()) {
                    if (err.getErrorCode() != 0) {
                        LOG.warn("Could not post history event to ATS" + ", atsPutError=" + err.getErrorCode()
                                + ", entityId=" + err.getEntityId());
                    }
                }
            }
            // Do nothing additional, ATS client library should handle throttling
            // or auto-disable as needed
        } catch (Exception e) {
            LOG.warn("Could not handle history events", e);
        }

    }

}