org.apache.tez.analyzer.TestAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tez.analyzer.TestAnalyzer.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.analyzer;

import static org.junit.Assert.assertTrue;

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer;
import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer.CriticalPathDependency;
import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer.CriticalPathStep;
import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer.CriticalPathStep.EntityType;
import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezConstants;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService;
import org.apache.tez.dag.history.logging.impl.SimpleHistoryLoggingService;
import org.apache.tez.dag.records.TaskAttemptTerminationCause;
import org.apache.tez.dag.records.TezDAGID;
import org.apache.tez.history.ATSImportTool;
import org.apache.tez.history.parser.ATSFileParser;
import org.apache.tez.history.parser.SimpleHistoryParser;
import org.apache.tez.history.parser.datamodel.DagInfo;
import org.apache.tez.test.SimpleTestDAG;
import org.apache.tez.test.SimpleTestDAG3Vertices;
import org.apache.tez.test.TestInput;
import org.apache.tez.test.TestProcessor;
import org.apache.tez.test.dag.SimpleReverseVTestDAG;
import org.apache.tez.test.dag.SimpleVTestDAG;
import org.apache.tez.tests.MiniTezClusterWithTimeline;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;

public class TestAnalyzer {
    // Logger for progress and diagnostic output of this test class.
    private static final Logger LOG = LoggerFactory.getLogger(TestAnalyzer.class);

    // Working directory of this test class; used as the MiniDFSCluster base dir and as the parent of staging dirs.
    private static String TEST_ROOT_DIR = "target" + Path.SEPARATOR + TestAnalyzer.class.getName() + "-tmpDir";
    // Local directory that receives downloaded history data (ATS zip files or the simple-history text file).
    private static String DOWNLOAD_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "download";
    // Directory configured as the simple history logging dir for the simple-history session.
    private final static String SIMPLE_HISTORY_DIR = "/tmp/simplehistory/";
    // Base name of the simple-history file; getDagInfo appends "." + application attempt id when locating it.
    private final static String HISTORY_TXT = "history.txt";

    // Mini clusters shared by all tests; created in setupClass and stopped in tearDownClass.
    private static MiniDFSCluster dfsCluster;
    private static MiniTezClusterWithTimeline miniTezCluster;

    // Base configuration used to start the mini clusters; re-created in setupClass.
    private static Configuration conf = new Configuration();
    // File system of the mini DFS cluster; assigned in setupClass.
    private static FileSystem fs;

    // Tez session used to submit the test DAGs; created by the createTezSession* methods.
    private static TezClient tezSession = null;

    // Selects the history source in getDagInfo: true reads via ATS, false reads the simple-history file.
    private boolean usingATS = true;
    // Set once the simple-history file has been copied to DOWNLOAD_DIR, so it is copied only once per instance.
    private boolean downloadedSimpleHistoryFile = false;
    // Timeline service web-app address read from the mini Tez cluster config in setupTezCluster.
    private static String yarnTimelineAddress;

    /**
     * Starts the shared mini DFS cluster, points the base configuration at it and then
     * brings up the mini Tez cluster via {@link #setupTezCluster()}.
     *
     * @throws Exception if either mini cluster fails to start
     */
    @BeforeClass
    public static void setupClass() throws Exception {
        final Configuration clusterConf = new Configuration();
        conf = clusterConf;

        EditLogFileOutputStream.setShouldSkipFsyncForTesting(true);
        clusterConf.setBoolean(DFSConfigKeys.DFS_NAMENODE_EDITS_NOEDITLOGCHANNELFLUSH, false);
        clusterConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, TEST_ROOT_DIR);

        // Single data node, freshly formatted.
        final MiniDFSCluster.Builder dfsBuilder = new MiniDFSCluster.Builder(clusterConf);
        dfsCluster = dfsBuilder.numDataNodes(1).format(true).build();

        fs = dfsCluster.getFileSystem();
        final String defaultFsUri = fs.getUri().toString();
        clusterConf.set("fs.defaultFS", defaultFsUri);

        setupTezCluster();
    }

    /**
     * Stops the mini Tez cluster and then the mini DFS cluster.
     *
     * <p>The DFS cluster is shut down in a {@code finally} block so that a failure while
     * stopping the Tez cluster does not leave the DFS cluster running.
     *
     * @throws Exception if stopping either cluster fails
     */
    @AfterClass
    public static void tearDownClass() throws Exception {
        LOG.info("Stopping mini clusters");
        try {
            if (miniTezCluster != null) {
                miniTezCluster.stop();
            }
        } finally {
            miniTezCluster = null;
            if (dfsCluster != null) {
                try {
                    dfsCluster.shutdown();
                } finally {
                    dfsCluster = null;
                }
            }
        }
    }

    /**
     * Builds a {@link CriticalPathAnalyzer} whose configuration has
     * {@code CriticalPathAnalyzer.DRAW_SVG} set to {@code false}.
     *
     * @return the configured analyzer
     */
    private CriticalPathAnalyzer setupCPAnalyzer() {
        final CriticalPathAnalyzer analyzer = new CriticalPathAnalyzer();

        // Start from an empty configuration (no default resources loaded).
        final Configuration noSvgConf = new Configuration(false);
        noSvgConf.setBoolean(CriticalPathAnalyzer.DRAW_SVG, false);

        analyzer.setConf(noSvgConf);
        return analyzer;
    }

    /**
     * Configures and starts the mini Tez cluster with the timeline service enabled, then
     * records the timeline web-app address in {@code yarnTimelineAddress}.
     *
     * @throws Exception if the cluster cannot be initialised or started
     */
    private static void setupTezCluster() throws Exception {
        final String atsLoggingClass = ATSHistoryLoggingService.class.getName();
        final String clusterName = TestAnalyzer.class.getName();

        conf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, atsLoggingClass);
        conf.setBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS, true);
        conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
        // A short heartbeat interval makes the test run faster.
        conf.setInt(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, 100);

        final MiniTezClusterWithTimeline cluster = new MiniTezClusterWithTimeline(clusterName, 1, 1, 1, true);
        // Publish the reference before init/start so tearDownClass can stop it on failure.
        miniTezCluster = cluster;
        cluster.init(conf);
        cluster.start();

        yarnTimelineAddress = cluster.getConfig().get(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS);
    }

    /**
     * Creates the Tez configuration shared by both session flavours: it copies the mini
     * Tez cluster configuration, caps the AM-RM heartbeat interval at 100 ms, disables
     * node blacklisting and selects a randomly named staging directory under
     * {@code TEST_ROOT_DIR} on the mini DFS.
     *
     * @return the prepared configuration
     * @throws Exception if the DFS file system cannot be obtained
     */
    private TezConfiguration createCommonTezLog() throws Exception {
        final TezConfiguration sessionConf = new TezConfiguration(miniTezCluster.getConfig());

        final String stagingDirName = String.valueOf(new Random().nextInt(100000));
        final FileSystem dfs = dfsCluster.getFileSystem();
        final Path qualifiedStagingDir = dfs.makeQualified(new Path(TEST_ROOT_DIR, stagingDirName));

        sessionConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
        sessionConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, qualifiedStagingDir.toString());
        sessionConf.setInt(TezConfiguration.TEZ_AM_RM_HEARTBEAT_INTERVAL_MS_MAX, 100);

        return sessionConf;
    }

    /**
     * Creates and starts the shared Tez session with history logged through
     * {@link ATSHistoryLoggingService}.
     *
     * <p>The staging directory and node-blacklisting settings come from
     * {@link #createCommonTezLog()}; they are not repeated here.
     *
     * @throws Exception if the session cannot be created or started
     */
    private void createTezSessionATS() throws Exception {
        TezConfiguration tezConf = createCommonTezLog();
        tezConf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
        tezConf.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS,
                miniTezCluster.getConfig().get(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS));
        tezConf.setBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS, true);
        tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, ATSHistoryLoggingService.class.getName());

        tezSession = TezClient.create("TestAnalyzer", tezConf, true);
        tezSession.start();
    }

    /**
     * Creates and starts the shared Tez session with history logged through
     * {@link SimpleHistoryLoggingService} into {@code SIMPLE_HISTORY_DIR}.
     *
     * <p>The staging directory and node-blacklisting settings come from
     * {@link #createCommonTezLog()}; they are not repeated here. The session is named
     * after this test class (the previous name was copied from another test).
     *
     * @throws Exception if the session cannot be created or started
     */
    private void createTezSessionSimpleHistory() throws Exception {
        TezConfiguration tezConf = createCommonTezLog();
        tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS,
                SimpleHistoryLoggingService.class.getName());
        tezConf.set(TezConfiguration.TEZ_SIMPLE_HISTORY_LOGGING_DIR, SIMPLE_HISTORY_DIR);

        tezSession = TezClient.create("TestAnalyzer", tezConf, true);
        tezSession.start();
    }

    /**
     * Creates an expectation for one critical-path step that checks only the attempt
     * name pattern and the dependency reason (no termination cause, no notes).
     *
     * @param attempt regex matched against the attempt's short name
     * @param reason expected dependency reason of the step
     * @return the expectation
     */
    private StepCheck createStep(String attempt, CriticalPathDependency reason) {
        return new StepCheck(attempt, reason, null, null);
    }

    /**
     * Creates an expectation for one critical-path step.
     *
     * @param attempt regex matched against the attempt's short name
     * @param reason expected dependency reason of the step
     * @param errCause expected termination cause, or {@code null} to skip that check
     * @param notes expected notes, or {@code null} to skip that check
     * @return the expectation
     */
    private StepCheck createStep(String attempt, CriticalPathDependency reason,
            TaskAttemptTerminationCause errCause, List<String> notes) {
        final StepCheck expectation = new StepCheck(attempt, reason, errCause, notes);
        return expectation;
    }

    private class StepCheck {
        String attempt; // attempt is the TaskAttemptInfo short name with regex
        CriticalPathDependency reason;
        TaskAttemptTerminationCause errCause;
        List<String> notesStr;

        StepCheck(String attempt, CriticalPathDependency reason, TaskAttemptTerminationCause cause,
                List<String> notes) {
            this.attempt = attempt;
            this.reason = reason;
            this.errCause = cause;
            this.notesStr = notes;
        }

        String getAttemptDetail() {
            return attempt;
        }

        CriticalPathDependency getReason() {
            return reason;
        }

        TaskAttemptTerminationCause getErrCause() {
            return errCause;
        }

        List<String> getNotesStr() {
            return notesStr;
        }
    }

    /**
     * Submits the DAG to the shared Tez session, polls until it completes and asserts
     * its final state.
     *
     * @param dag DAG to run
     * @param finalState state the DAG is expected to finish in
     * @throws Exception if submission or status polling fails
     */
    private void runDAG(DAG dag, DAGStatus.State finalState) throws Exception {
        // Single source of truth for the poll interval so the log message cannot drift from the sleep.
        final long pollIntervalMs = 100;

        tezSession.waitTillReady();
        LOG.info("ABC Running DAG name: " + dag.getName());
        DAGClient dagClient = tezSession.submitDAG(dag);
        DAGStatus dagStatus = dagClient.getDAGStatus(null);
        while (!dagStatus.isCompleted()) {
            LOG.info("Waiting for dag to complete. Sleeping for {}ms. DAG name: {} DAG appContext: {}"
                    + " Current state: {}", pollIntervalMs, dag.getName(), dagClient.getExecutionContext(),
                    dagStatus.getState());
            Thread.sleep(pollIntervalMs);
            dagStatus = dagClient.getDAGStatus(null);
        }

        Assert.assertEquals(finalState, dagStatus.getState());
    }

    /**
     * Loads the history of the {@code dagNum}-th DAG of the application and checks its
     * critical path against the expected step sequences.
     *
     * @param appId application that ran the DAG
     * @param dagNum DAG number within the application
     * @param steps acceptable step sequences for the critical path
     * @throws Exception if the history cannot be loaded or parsed
     */
    private void verify(ApplicationId appId, int dagNum, List<StepCheck[]> steps) throws Exception {
        final TezDAGID dagIdentifier = TezDAGID.getInstance(appId, dagNum);
        final DagInfo parsedDag = getDagInfo(dagIdentifier.toString());
        verifyCriticalPath(parsedDag, steps);
    }

    /**
     * Loads the parsed history of a DAG from the source selected by {@code usingATS}.
     *
     * <p>With ATS, the DAG data is exported with {@link ATSImportTool} into
     * {@code DOWNLOAD_DIR} and parsed from the resulting zip. Otherwise the simple-history
     * file of application attempt 1 is copied from DFS to {@code DOWNLOAD_DIR} (once per
     * test instance) and parsed with {@link SimpleHistoryParser}.
     *
     * @param dagId string form of the DAG id
     * @return the parsed DAG information, whose id equals {@code dagId}
     * @throws Exception if exporting, copying or parsing fails
     */
    private DagInfo getDagInfo(String dagId) throws Exception {
        DagInfo dagInfo;
        if (usingATS) {
            // Export the data from ATS
            String[] args = { "--dagId=" + dagId, "--downloadDir=" + DOWNLOAD_DIR,
                    "--yarnTimelineAddress=" + yarnTimelineAddress };

            int result = ATSImportTool.process(args);
            Assert.assertEquals("ATSImportTool failed for " + dagId, 0, result);

            // Parse the downloaded contents
            File downloadedFile = new File(DOWNLOAD_DIR + Path.SEPARATOR + dagId + ".zip");
            ATSFileParser parser = new ATSFileParser(downloadedFile);
            dagInfo = parser.getDAGData(dagId);
        } else {
            if (!downloadedSimpleHistoryFile) {
                TezDAGID tezDAGID = TezDAGID.fromString(dagId);
                ApplicationAttemptId applicationAttemptId = ApplicationAttemptId
                        .newInstance(tezDAGID.getApplicationId(), 1);
                Path historyPath = new Path(miniTezCluster.getConfig().get("fs.defaultFS") + SIMPLE_HISTORY_DIR
                        + HISTORY_TXT + "." + applicationAttemptId);
                // Named historyFs to avoid shadowing the static fs field.
                FileSystem historyFs = historyPath.getFileSystem(miniTezCluster.getConfig());

                Path localPath = new Path(DOWNLOAD_DIR, HISTORY_TXT);
                historyFs.copyToLocalFile(historyPath, localPath);
                // Mark as downloaded only after the copy has succeeded.
                downloadedSimpleHistoryFile = true;
            }
            // Now parse via SimpleHistory
            File localFile = new File(DOWNLOAD_DIR, HISTORY_TXT);
            SimpleHistoryParser parser = new SimpleHistoryParser(localFile);
            dagInfo = parser.getDAGData(dagId);
        }
        Assert.assertNotNull("No DAG data parsed for " + dagId, dagInfo);
        Assert.assertEquals(dagId, dagInfo.getDagId());
        return dagInfo;
    }

    /**
     * Analyzes the DAG with {@link CriticalPathAnalyzer} and checks the resulting critical
     * path against the first expected sequence whose length fits.
     *
     * <p>A sequence fits when it has exactly two steps fewer than the critical path: the
     * path must start with a {@code VERTEX_INIT} step and end with a {@code DAG_COMMIT}
     * step, with one {@code ATTEMPT} step per expected entry in between. For each entry
     * the attempt short name must match the entry's regex and the reason must be equal;
     * the termination cause and notes are checked only when the entry specifies them.
     *
     * @param dagInfo parsed DAG history to analyze
     * @param stepsOptions acceptable step sequences
     * @throws Exception if the analysis fails
     */
    private void verifyCriticalPath(DagInfo dagInfo, List<StepCheck[]> stepsOptions) throws Exception {
        CriticalPathAnalyzer cp = setupCPAnalyzer();
        cp.analyze(dagInfo);

        List<CriticalPathStep> criticalPath = cp.getCriticalPath();

        // Dump the whole path first so a failing assertion below can be diagnosed from the log.
        for (CriticalPathStep step : criticalPath) {
            LOG.info("ABC Step: " + step.getType());
            if (step.getType() == EntityType.ATTEMPT) {
                LOG.info("ABC Attempt: " + step.getAttempt().getShortName() + " "
                        + step.getAttempt().getDetailedStatus());
            }
            LOG.info("ABC Reason: " + step.getReason());
            String notes = Joiner.on(";").join(step.getNotes());
            LOG.info("ABC Notes: " + notes);
        }

        boolean foundMatchingLength = false;
        for (StepCheck[] steps : stepsOptions) {
            if (steps.length + 2 != criticalPath.size()) {
                continue;
            }
            foundMatchingLength = true;
            Assert.assertEquals(CriticalPathStep.EntityType.VERTEX_INIT, criticalPath.get(0).getType());
            Assert.assertEquals(criticalPath.get(1).getAttempt().getShortName(),
                    criticalPath.get(0).getAttempt().getShortName());

            for (int i = 1; i < criticalPath.size() - 1; ++i) {
                StepCheck check = steps[i - 1];
                CriticalPathStep step = criticalPath.get(i);
                Assert.assertEquals(CriticalPathStep.EntityType.ATTEMPT, step.getType());
                Assert.assertTrue(check.getAttemptDetail(),
                        step.getAttempt().getShortName().matches(check.getAttemptDetail()));
                Assert.assertEquals(check.getReason(), step.getReason());
                if (check.getErrCause() != null) {
                    Assert.assertEquals(check.getErrCause(),
                            TaskAttemptTerminationCause.valueOf(step.getAttempt().getTerminationCause()));
                }
                if (check.getNotesStr() != null) {
                    String notes = Joiner.on("#").join(step.getNotes());
                    for (String note : check.getNotesStr()) {
                        // Compare against the expected note; the joined string always contains itself.
                        Assert.assertTrue("Expected note '" + note + "' not found in: " + notes,
                                notes.contains(note));
                    }
                }
            }

            Assert.assertEquals(CriticalPathStep.EntityType.DAG_COMMIT,
                    criticalPath.get(criticalPath.size() - 1).getType());
            break;
        }

        Assert.assertTrue("No expected step sequence fits a critical path of size " + criticalPath.size(),
                foundMatchingLength);
    }

    /**
     * Runs the whole DAG suite on a session that logs history through ATS and verifies
     * the critical paths from the data exported from ATS.
     *
     * @throws Exception if session setup, any DAG run or verification fails
     */
    @Test(timeout = 300000)
    public void testWithATS() throws Exception {
        this.usingATS = true;
        this.createTezSessionATS();
        this.runTests();
    }

    /**
     * Runs the whole DAG suite on a session that logs history through the simple history
     * logging service and verifies the critical paths from the copied history file.
     *
     * @throws Exception if session setup, any DAG run or verification fails
     */
    @Test(timeout = 300000)
    public void testWithSimpleHistory() throws Exception {
        this.usingATS = false;
        this.createTezSessionSimpleHistory();
        this.runTests();
    }

    /**
     * Runs every test DAG on the current session, stops the session so history is
     * flushed, and then verifies the critical path of each DAG.
     *
     * <p>DAGs are verified by number ({@code i + 1}) in the order they were submitted, so
     * the order of the {@code add} calls must match the submission order. The session is
     * stopped in a {@code finally} block so that a failing DAG run does not leave the
     * session running for the next test on the shared mini cluster.
     *
     * @throws Exception if any DAG run or verification fails
     */
    private void runTests() throws Exception {
        final ApplicationId appId;
        List<List<StepCheck[]>> stepsOptions = Lists.newArrayList();
        try {
            appId = tezSession.getAppMasterApplicationId();
            // run all test dags
            stepsOptions.add(testAttemptOfDownstreamVertexConnectedWithTwoUpstreamVerticesFailure());
            stepsOptions.add(testInputFailureCausesRerunOfTwoVerticesWithoutExit());
            stepsOptions.add(testMultiVersionInputFailureWithoutExit());
            stepsOptions.add(testCascadingInputFailureWithoutExitSuccess());
            stepsOptions.add(testTaskMultipleFailures());
            stepsOptions.add(testBasicInputFailureWithoutExit());
            stepsOptions.add(testBasicTaskFailure());
            stepsOptions.add(testBasicSuccessScatterGather());
            stepsOptions.add(testMultiVersionInputFailureWithExit());
            stepsOptions.add(testBasicInputFailureWithExit());
            stepsOptions.add(testInputFailureRerunCanSendOutputToTwoDownstreamVertices());
            stepsOptions.add(testCascadingInputFailureWithExitSuccess());
            stepsOptions.add(testInternalPreemption());
        } finally {
            // close session to flush, even when a DAG run failed
            if (tezSession != null) {
                tezSession.stop();
            }
        }
        Thread.sleep((TezConstants.TEZ_DAG_SLEEP_TIME_BEFORE_EXIT * 3) / 2);

        // verify all dags
        for (int i = 0; i < stepsOptions.size(); ++i) {
            verify(appId, i + 1, stepsOptions.get(i));
        }
    }

    /**
     * Runs {@code SimpleTestDAG} with the task count set to 1 and no injected failures.
     * Expected attempt steps: v1 attempt 0, then v2 attempt 0.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testBasicSuccessScatterGather() throws Exception {
        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);

        final DAG scatterGatherDag = SimpleTestDAG.createDAG("testBasicSuccessScatterGather", dagConf);
        runDAG(scatterGatherDag, DAGStatus.State.SUCCEEDED);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Runs {@code SimpleTestDAG} with the v1 processor configured to fail task index 0 up
     * to attempt 0. Expected attempt steps: v1 attempt 0, its retry v1 attempt 1, then
     * v2 attempt 0.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testBasicTaskFailure() throws Exception {
        final String failingVertex = "v1";
        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);

        final String doFailKey = TestProcessor.getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_DO_FAIL,
                failingVertex);
        dagConf.setBoolean(doFailKey, true);
        final String taskIndexKey = TestProcessor
                .getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_TASK_INDEX, failingVertex);
        dagConf.set(taskIndexKey, "0");
        final String uptoAttemptKey = TestProcessor
                .getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_UPTO_TASK_ATTEMPT, failingVertex);
        dagConf.setInt(uptoAttemptKey, 0);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v1 : 000000_1", CriticalPathDependency.RETRY_DEPENDENCY),
                createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY) };

        runDAG(SimpleTestDAG.createDAG("testBasicTaskFailure", dagConf), DAGStatus.State.SUCCEEDED);
        return Collections.singletonList(expectedPath);
    }

    /**
     * Runs {@code SimpleTestDAG} with the v1 processor configured to fail task index 0 up
     * to attempt 1. Expected attempt steps: v1 attempts 0, 1 and 2 (the latter two as
     * retries), then v2 attempt 0.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testTaskMultipleFailures() throws Exception {
        final String failingVertex = "v1";
        final CriticalPathDependency retry = CriticalPathDependency.RETRY_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);
        dagConf.setBoolean(
                TestProcessor.getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_DO_FAIL, failingVertex), true);
        dagConf.set(TestProcessor.getVertexConfName(TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_TASK_INDEX,
                failingVertex), "0");
        dagConf.setInt(TestProcessor.getVertexConfName(
                TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_UPTO_TASK_ATTEMPT, failingVertex), 1);

        final DAG failingDag = SimpleTestDAG.createDAG("testTaskMultipleFailures", dagConf);
        runDAG(failingDag, DAGStatus.State.SUCCEEDED);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v1 : 000000_1", retry),
                createStep("v1 : 000000_2", retry),
                createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Runs {@code SimpleTestDAG} with v2 task 0 attempt 0 configured to fail input 0 and
     * exit. Expected attempt steps: v1 attempt 0, v2 attempt 0, v1 attempt 1 (output
     * re-creation), then v2 attempt 1.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testBasicInputFailureWithExit() throws Exception {
        final String consumer = "v2";
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);
        dagConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, consumer), true);
        dagConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, consumer),
                true);
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, consumer), "0");

        final DAG failingDag = SimpleTestDAG.createDAG("testBasicInputFailureWithExit", dagConf);
        runDAG(failingDag, DAGStatus.State.SUCCEEDED);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", data),
                createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY),
                createStep("v2 : 000000_1", data) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Runs {@code SimpleTestDAG} with v2 task 0 attempt 0 configured to fail input 0
     * without the fail-and-exit flag. Expected attempt steps: v1 attempt 0, v2 attempt 0,
     * v1 attempt 1 (output re-creation), then v2 attempt 0 again.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testBasicInputFailureWithoutExit() throws Exception {
        final String consumer = "v2";
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);
        dagConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, consumer), true);
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, consumer), "0");

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", data),
                createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY),
                createStep("v2 : 000000_0", data) };

        runDAG(SimpleTestDAG.createDAG("testBasicInputFailureWithoutExit", dagConf), DAGStatus.State.SUCCEEDED);
        return Collections.singletonList(expectedPath);
    }

    /**
     * Runs {@code SimpleTestDAG} with v2 task 0 attempts "0,1" configured to fail input 0
     * up to input attempt 1 and exit. Expected attempt steps alternate between v2
     * attempts 0..2 and re-created v1 attempts 1..2, after the initial v1 attempt 0.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testMultiVersionInputFailureWithExit() throws Exception {
        final String consumer = "v2";
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;
        final CriticalPathDependency recreate = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);
        dagConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, consumer), true);
        dagConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, consumer),
                true);
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, consumer),
                "0,1");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, consumer), "0");
        dagConf.setInt(
                TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, consumer), 1);

        final DAG failingDag = SimpleTestDAG.createDAG("testMultiVersionInputFailureWithExit", dagConf);
        runDAG(failingDag, DAGStatus.State.SUCCEEDED);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", data),
                createStep("v1 : 000000_1", recreate),
                createStep("v2 : 000000_1", data),
                createStep("v1 : 000000_2", recreate),
                createStep("v2 : 000000_2", data) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Runs {@code SimpleTestDAG} with v2 task 0 attempt 0 configured to fail input 0 up
     * to input attempt 1, without the fail-and-exit flag. Expected attempt steps: v1
     * attempt 0, then v2 attempt 0 three times, interleaved with re-created v1 attempts
     * 1 and 2.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testMultiVersionInputFailureWithoutExit() throws Exception {
        final String consumer = "v2";
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;
        final CriticalPathDependency recreate = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        dagConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1);
        dagConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, consumer), true);
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, consumer), "0");
        dagConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, consumer), "0");
        dagConf.setInt(
                TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, consumer), 1);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", data),
                createStep("v1 : 000000_1", recreate),
                createStep("v2 : 000000_0", data),
                createStep("v1 : 000000_2", recreate),
                createStep("v2 : 000000_0", data) };

        runDAG(SimpleTestDAG.createDAG("testMultiVersionInputFailureWithoutExit", dagConf),
                DAGStatus.State.SUCCEEDED);
        return Collections.singletonList(expectedPath);
    }

    /**
     * Populates the configuration for the cascading input failure tests that use
     * {@code SimpleTestDAG3Vertices}: input failures are injected into v2 (task index
     * "-1", attempt "1", input index "0") and into v3 (task index "0", attempt "0",
     * input index "-1"), each up to input attempt 0.
     *
     * @param testConf configuration to populate
     * @param failAndExit whether an input failure should also make the attempt exit
     * @param numTasks value stored under the DAG's task-count key
     */
    private void setCascadingInputFailureConfig(Configuration testConf, boolean failAndExit, int numTasks) {
        final String middleVertex = "v2";
        final String lastVertex = "v3";

        testConf.setInt(SimpleTestDAG3Vertices.TEZ_SIMPLE_DAG_NUM_TASKS, numTasks);

        // v2 attempt0 succeeds.
        // v2 all tasks attempt1 input0 fail up to version 0.
        testConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, middleVertex), true);
        testConf.setBoolean(
                TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, middleVertex),
                failAndExit);
        testConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, middleVertex),
                "-1");
        testConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, middleVertex),
                "1");
        testConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, middleVertex),
                "0");
        testConf.setInt(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT,
                middleVertex), 0);

        // v3 task0 attempt0 all inputs fail up to version 0.
        testConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, lastVertex), true);
        testConf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, lastVertex),
                failAndExit);
        testConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, lastVertex),
                "0");
        testConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, lastVertex),
                "0");
        testConf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, lastVertex),
                "-1");
        testConf.setInt(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT,
                lastVertex), 0);
    }

    /**
     * Cascading input failure without exit on the chain v1 -- v2 -- v3; the DAG is
     * expected to succeed.
     *
     * <p>Scenario, as configured by {@link #setCascadingInputFailureConfig}: v3 attempt 0
     * reports input failures and waits, which triggers a v2 rerun. v2 attempt 1 reports a
     * failure on input 0 and waits, which triggers a v1 rerun. v1 attempt 1 succeeds, v2
     * attempt 1 accepts its output and succeeds, and v3 attempt 0 accepts the output of
     * v2 attempt 1.
     *
     * <p>AM vertex succeeded order is v1, v2, v1, v2, v3.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testCascadingInputFailureWithoutExitSuccess() throws Exception {
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;
        final CriticalPathDependency recreate = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        setCascadingInputFailureConfig(dagConf, false, 1);

        final DAG cascadingDag = SimpleTestDAG3Vertices.createDAG("testCascadingInputFailureWithoutExitSuccess",
                dagConf);
        runDAG(cascadingDag, DAGStatus.State.SUCCEEDED);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", data),
                createStep("v3 : 000000_0", data),
                createStep("v2 : 000000_1", recreate),
                createStep("v1 : 000000_1", recreate),
                createStep("v2 : 000000_1", data),
                createStep("v3 : 000000_0", data) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Cascading input failure with exit on the chain v1 -- v2 -- v3; the DAG is expected
     * to succeed.
     *
     * <p>Scenario, as configured by {@link #setCascadingInputFailureConfig}: v3 attempt 0
     * reports input failures and exits, which triggers a v2 rerun. v2 attempt 1 reports a
     * failure on input 0 and exits, which triggers a v1 rerun. v1 attempt 1 succeeds, v2
     * attempt 2 accepts its output and succeeds, and v3 attempt 1 accepts the output of
     * v2 attempt 2.
     *
     * <p>AM vertex succeeded order is v1, v2, v3, v1, v2, v3.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testCascadingInputFailureWithExitSuccess() throws Exception {
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;
        final CriticalPathDependency recreate = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        setCascadingInputFailureConfig(dagConf, true, 1);

        final DAG cascadingDag = SimpleTestDAG3Vertices.createDAG("testCascadingInputFailureWithExitSuccess",
                dagConf);
        runDAG(cascadingDag, DAGStatus.State.SUCCEEDED);

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 000000_0", data),
                createStep("v3 : 000000_0", data),
                createStep("v2 : 000000_1", recreate),
                createStep("v1 : 000000_1", recreate),
                createStep("v2 : 000000_2", data),
                createStep("v3 : 000000_1", data) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Cascading input failure with two tasks per vertex, arranged so that a retry
     * preempts a running attempt.
     *
     * <p>One NM is running and can host 4 containers, based on YARN mini cluster defaults
     * and Tez defaults for AM/task memory. v3 task0 reports read errors against both
     * tasks of v2, which restarts both of them; all 4 slots are then occupied (1 AM + 3
     * tasks). The v2 retries report a read error against one task of v1, which restarts
     * it. The v1 retry has no free slot, so it preempts the lowest priority task (current
     * Tez logic): v3 is preempted and re-run, and shows up on the critical path as a
     * preempted failure. The notes of the v1 retry attempt also record that it caused the
     * preemption of v3.
     *
     * @return the single acceptable step sequence
     * @throws Exception if the DAG does not finish as SUCCEEDED
     */
    private List<StepCheck[]> testInternalPreemption() throws Exception {
        final CriticalPathDependency data = CriticalPathDependency.DATA_DEPENDENCY;
        final CriticalPathDependency recreate = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;

        final Configuration dagConf = new Configuration(false);
        setCascadingInputFailureConfig(dagConf, false, 2);

        final DAG preemptionDag = SimpleTestDAG3Vertices.createDAG("testInternalPreemption", dagConf);
        runDAG(preemptionDag, DAGStatus.State.SUCCEEDED);

        // Regexes allow either task of v1/v2 to be the one on the critical path.
        final StepCheck preemptedV3 = createStep("v3 : 000000_0", data,
                TaskAttemptTerminationCause.INTERNAL_PREEMPTION, null);
        final StepCheck preemptingV1 = createStep("v1 : 000000_1", recreate, null,
                Collections.singletonList("preemption of v3"));

        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 00000[01]_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v2 : 00000[01]_0", data),
                preemptedV3,
                createStep("v2 : 00000[01]_1", recreate),
                preemptingV1,
                createStep("v2 : 00000[01]_1", data),
                createStep("v3 : 000000_1", data) };
        return Collections.singletonList(expectedPath);
    }

    /**
     * Input failure of v3 causes rerun of both v1 and v2 vertices.
     *   v1  v2
     *    \ /
     *    v3
     * 
     * @throws Exception
     */
    private List<StepCheck[]> testInputFailureCausesRerunOfTwoVerticesWithoutExit() throws Exception {
        final String failingVertex = "v3";

        final Configuration conf = new Configuration(false);
        conf.setInt(SimpleVTestDAG.TEZ_SIMPLE_V_DAG_NUM_TASKS, 1);

        // TestInput failure settings, all scoped to v3: fail without exiting, task index "0",
        // task attempt "0", input index "-1", failing up to input attempt "1".
        conf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, failingVertex), true);
        conf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, failingVertex),
                false);
        conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, failingVertex), "0");
        conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, failingVertex),
                "0");
        conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, failingVertex),
                "-1");
        conf.set(
                TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, failingVertex),
                "1");

        // The same v3 attempt shows up after every upstream re-run, so name it once.
        final String v3Attempt = "v3 : 000000_0";
        final CriticalPathDependency dataDep = CriticalPathDependency.DATA_DEPENDENCY;
        final CriticalPathDependency recreateDep = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;

        // Regex is used for the upstream steps because either v1 or v2 may be on the path.
        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v[12] : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep(v3Attempt, dataDep),
                createStep("v[12] : 000000_[01]", recreateDep),
                createStep(v3Attempt, dataDep),
                createStep("v[12] : 000000_[012]", recreateDep),
                createStep(v3Attempt, dataDep),
                createStep("v[12] : 000000_[12]", recreateDep),
                createStep(v3Attempt, dataDep),
                createStep("v[12] : 000000_2", recreateDep),
                createStep(v3Attempt, dataDep), };

        runDAG(SimpleVTestDAG.createDAG("testInputFailureCausesRerunOfTwoVerticesWithoutExit", conf),
                DAGStatus.State.SUCCEEDED);
        return Collections.singletonList(expectedPath);
    }

    /**
     * Downstream (v3) attempt failure of a vertex connected with
     * 2 upstream vertices.
     *   v1  v2
     *    \ /
     *    v3
     * 
     * @throws Exception
     */
    private List<StepCheck[]> testAttemptOfDownstreamVertexConnectedWithTwoUpstreamVerticesFailure()
            throws Exception {
        final String failingVertex = "v3";

        final Configuration conf = new Configuration(false);
        conf.setInt(SimpleVTestDAG.TEZ_SIMPLE_V_DAG_NUM_TASKS, 1);

        // TestProcessor failure settings for v3: enable failing, task index "0", up to task attempt 1.
        // Each key is resolved right before it is applied.
        final String doFailKey = TestProcessor.getVertexConfName(
                TestProcessor.TEZ_FAILING_PROCESSOR_DO_FAIL, failingVertex);
        conf.setBoolean(doFailKey, true);

        final String taskIndexKey = TestProcessor.getVertexConfName(
                TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_TASK_INDEX, failingVertex);
        conf.set(taskIndexKey, "0");

        final String uptoAttemptKey = TestProcessor.getVertexConfName(
                TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_UPTO_TASK_ATTEMPT, failingVertex);
        conf.setInt(uptoAttemptKey, 1);

        final CriticalPathDependency retryDep = CriticalPathDependency.RETRY_DEPENDENCY;

        // Regex is used for the first step because either v1 or v2 may be on the path;
        // after that the path is expected to follow the v3 attempts 0, 1 and 2.
        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v[12] : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY),
                createStep("v3 : 000000_1", retryDep),
                createStep("v3 : 000000_2", retryDep), };

        runDAG(SimpleVTestDAG.createDAG("testAttemptOfDownstreamVertexConnectedWithTwoUpstreamVerticesFailure",
                conf), DAGStatus.State.SUCCEEDED);
        return Collections.singletonList(expectedPath);
    }

    /**
     * Input failure of v2 and v3 triggers a rerun of v1.
     * Both v2 and v3 report an error on v1 and don't exit. So one of them triggers the next
     * version of v1 and also consumes the output of that next version, while the other
     * just consumes the output of the next version of v1.
     * Reruns can send output to 2 downstream vertices.
     *     v1
     *    /  \
     *   v2   v3 
     * 
     * Also covers multiple consumer vertices report failure against same producer task.
     * @throws Exception
     */
    private List<StepCheck[]> testInputFailureRerunCanSendOutputToTwoDownstreamVertices() throws Exception {
        final Configuration conf = new Configuration(false);
        conf.setInt(SimpleReverseVTestDAG.TEZ_SIMPLE_REVERSE_V_DAG_NUM_TASKS, 1);

        // v2 and v3 get identical TestInput failure settings (v2 first, then v3):
        // fail without exiting, task index "-1", task attempt "0", input index "-1",
        // failing up to input attempt "0".
        for (String consumer : new String[] { "v2", "v3" }) {
            conf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL, consumer), true);
            conf.setBoolean(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, consumer),
                    false);
            conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, consumer), "-1");
            conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, consumer), "0");
            conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, consumer), "-1");
            conf.set(TestInput.getVertexConfName(TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, consumer),
                    "0");
        }

        // Either downstream vertex may be on the path, hence the regex for the consumer steps.
        final String eitherConsumerAttempt = "v[23] : 000000_0";
        final StepCheck[] expectedPath = new StepCheck[] {
                createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY),
                createStep(eitherConsumerAttempt, CriticalPathDependency.DATA_DEPENDENCY),
                createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY),
                createStep(eitherConsumerAttempt, CriticalPathDependency.DATA_DEPENDENCY), };

        runDAG(SimpleReverseVTestDAG.createDAG("testInputFailureRerunCanSendOutputToTwoDownstreamVertices", conf),
                DAGStatus.State.SUCCEEDED);
        return Collections.singletonList(expectedPath);
    }

}