// NOTE(review): stray non-Java text ("Java tutorial") removed from the top of this file;
// it was extraction residue and not part of the source.
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tez.dag.app; import java.io.EOFException; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.tez.common.TezCommonUtils; import org.apache.tez.dag.api.TezConfiguration; import org.apache.tez.dag.api.TezConstants; import org.apache.tez.dag.app.dag.DAGState; import org.apache.tez.dag.app.dag.Task; import org.apache.tez.dag.app.dag.Vertex; import org.apache.tez.dag.app.dag.impl.DAGImpl; import org.apache.tez.dag.history.HistoryEvent; import org.apache.tez.dag.history.HistoryEventType; import org.apache.tez.dag.history.events.AMLaunchedEvent; import org.apache.tez.dag.history.events.AMStartedEvent; import org.apache.tez.dag.history.events.ContainerLaunchedEvent; import 
org.apache.tez.dag.history.events.ContainerStoppedEvent;
import org.apache.tez.dag.history.events.DAGCommitStartedEvent;
import org.apache.tez.dag.history.events.DAGFinishedEvent;
import org.apache.tez.dag.history.events.DAGInitializedEvent;
import org.apache.tez.dag.history.events.DAGStartedEvent;
import org.apache.tez.dag.history.events.DAGSubmittedEvent;
import org.apache.tez.dag.history.events.TaskAttemptFinishedEvent;
import org.apache.tez.dag.history.events.TaskAttemptStartedEvent;
import org.apache.tez.dag.history.events.TaskFinishedEvent;
import org.apache.tez.dag.history.events.TaskStartedEvent;
import org.apache.tez.dag.history.events.VertexCommitStartedEvent;
import org.apache.tez.dag.history.events.VertexRecoverableEventsGeneratedEvent;
import org.apache.tez.dag.history.events.VertexFinishedEvent;
import org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent;
import org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent;
import org.apache.tez.dag.history.events.VertexInitializedEvent;
import org.apache.tez.dag.history.events.VertexParallelismUpdatedEvent;
import org.apache.tez.dag.history.events.VertexStartedEvent;
import org.apache.tez.dag.history.recovery.RecoveryService;
import org.apache.tez.dag.records.TezDAGID;
import org.apache.tez.dag.records.TezVertexID;
import org.apache.tez.dag.recovery.records.RecoveryProtos;
import org.apache.tez.dag.recovery.records.RecoveryProtos.SummaryEventProto;

import com.google.common.annotations.VisibleForTesting;

/**
 * Parses the recovery data written by a previous AM attempt and reconstructs
 * the state of the last in-progress (or last completed) DAG so the current
 * attempt can resume it.
 *
 * Recovery data consists of a per-attempt "summary" stream of
 * {@link SummaryEventProto} records plus a per-DAG recovery file of
 * {@link HistoryEvent}s. While parsing, events are copied through into the
 * current attempt's recovery directory so the next attempt (if any) can
 * recover from this one.
 */
public class RecoveryParser {

  private static final Log LOG = LogFactory.getLog(RecoveryParser.class);

  private final DAGAppMaster dagAppMaster;
  // FileSystem hosting the recovery directories (typically the staging FS).
  private final FileSystem recoveryFS;
  // Root recovery dir shared by all attempts of this application.
  private final Path recoveryDataDir;
  // Recovery dir owned by the currently running attempt; data recovered from
  // the previous attempt is copied into here.
  private final Path currentAttemptRecoveryDataDir;
  private final int recoveryBufferSize;
  private final int currentAttemptId;

  // Name of the flag (created via mkdirs, see createDataRecoveredFlagFile)
  // that marks an attempt dir whose recovery data was fully copied.
  private static final String dataRecoveredFileFlag = "dataRecovered";

  /**
   * Creates a parser and eagerly creates the current attempt's recovery dir.
   *
   * @param dagAppMaster     owning AM; used for config and to register
   *                         recovered DAG state
   * @param recoveryFS       filesystem holding the recovery data
   * @param recoveryDataDir  root recovery directory for the application
   * @param currentAttemptId id of the attempt that is now running
   * @throws IOException if the current attempt dir cannot be created
   */
  public RecoveryParser(DAGAppMaster dagAppMaster,
      FileSystem recoveryFS,
      Path recoveryDataDir,
      int currentAttemptId) throws IOException {
    this.dagAppMaster = dagAppMaster;
    this.recoveryFS = recoveryFS;
    this.recoveryDataDir = recoveryDataDir;
    this.currentAttemptId = currentAttemptId;
    this.currentAttemptRecoveryDataDir =
        TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, currentAttemptId);
    recoveryBufferSize = dagAppMaster.getConfig().getInt(
        TezConfiguration.DAG_RECOVERY_FILE_IO_BUFFER_SIZE,
        TezConfiguration.DAG_RECOVERY_FILE_IO_BUFFER_SIZE_DEFAULT);
    this.recoveryFS.mkdirs(currentAttemptRecoveryDataDir);
  }

  /**
   * Result of {@link #parseRecoveryData()}: the reconstructed DAG (if any)
   * plus flags describing whether it completed or is non-recoverable.
   */
  public static class RecoveredDAGData {
    public TezDAGID recoveredDagID = null;
    public DAGImpl recoveredDAG = null;
    public DAGState dagState = null;
    public boolean isCompleted = false;
    // true when commit state makes resuming unsafe; see isDAGRecoverable()
    public boolean nonRecoverable = false;
    public String reason = null;
    public Map<String, LocalResource> cumulativeAdditionalResources = null;
  }

  /**
   * Debug helper: reads a summary stream to the end, logging each record.
   * Used only by {@link #main}.
   */
  private static void parseSummaryFile(FSDataInputStream inputStream)
      throws IOException {
    while (true) {
      RecoveryProtos.SummaryEventProto proto =
          RecoveryProtos.SummaryEventProto
              .parseDelimitedFrom(inputStream);
      if (proto == null) {
        // parseDelimitedFrom returns null at clean EOF
        LOG.info("Reached end of summary stream");
        break;
      }
      LOG.info("[SUMMARY]"
          + " dagId=" + proto.getDagId()
          + ", timestamp=" + proto.getTimestamp()
          + ", event=" + HistoryEventType.values()[proto.getEventType()]);
    }
  }

  /**
   * Reads the next history event from a DAG recovery stream.
   *
   * Wire format: an int ordinal of {@link HistoryEventType} followed by the
   * event's proto payload.
   *
   * @return the parsed event, or null on (possibly truncated) EOF
   * @throws IOException if the ordinal is out of range (corrupt data) or the
   *                     event type is not handled here
   */
  private static HistoryEvent getNextEvent(FSDataInputStream inputStream)
      throws IOException {
    int eventTypeOrdinal = -1;
    try {
      eventTypeOrdinal = inputStream.readInt();
    } catch (EOFException eof) {
      return null;
    }
    if (eventTypeOrdinal < 0 || eventTypeOrdinal >=
        HistoryEventType.values().length) {
      // Corrupt data
      // reached end
      throw new IOException("Corrupt data found when trying to read next event type"
          + ", eventTypeOrdinal=" + eventTypeOrdinal);
    }
    HistoryEventType eventType = HistoryEventType.values()[eventTypeOrdinal];
    HistoryEvent event;
    // Instantiate the right concrete event; payload is parsed below via
    // fromProtoStream().
    switch (eventType) {
      case AM_LAUNCHED:
        event = new AMLaunchedEvent();
        break;
      case AM_STARTED:
        event = new AMStartedEvent();
        break;
      case DAG_SUBMITTED:
        event = new DAGSubmittedEvent();
        break;
      case DAG_INITIALIZED:
        event = new DAGInitializedEvent();
        break;
      case DAG_STARTED:
        event = new DAGStartedEvent();
        break;
      case DAG_COMMIT_STARTED:
        event = new DAGCommitStartedEvent();
        break;
      case DAG_FINISHED:
        event = new DAGFinishedEvent();
        break;
      case CONTAINER_LAUNCHED:
        event = new ContainerLaunchedEvent();
        break;
      case CONTAINER_STOPPED:
        event = new ContainerStoppedEvent();
        break;
      case VERTEX_INITIALIZED:
        event = new VertexInitializedEvent();
        break;
      case VERTEX_STARTED:
        event = new VertexStartedEvent();
        break;
      case VERTEX_PARALLELISM_UPDATED:
        event = new VertexParallelismUpdatedEvent();
        break;
      case VERTEX_COMMIT_STARTED:
        event = new VertexCommitStartedEvent();
        break;
      case VERTEX_GROUP_COMMIT_STARTED:
        event = new VertexGroupCommitStartedEvent();
        break;
      case VERTEX_GROUP_COMMIT_FINISHED:
        event = new VertexGroupCommitFinishedEvent();
        break;
      case VERTEX_FINISHED:
        event = new VertexFinishedEvent();
        break;
      case TASK_STARTED:
        event = new TaskStartedEvent();
        break;
      case TASK_FINISHED:
        event = new TaskFinishedEvent();
        break;
      case TASK_ATTEMPT_STARTED:
        event = new TaskAttemptStartedEvent();
        break;
      case TASK_ATTEMPT_FINISHED:
        event = new TaskAttemptFinishedEvent();
        break;
      case VERTEX_DATA_MOVEMENT_EVENTS_GENERATED:
        event = new VertexRecoverableEventsGeneratedEvent();
        break;
      default:
        throw new IOException("Invalid data found, unknown event type " + eventType);
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Parsing event from input stream"
          + ", eventType=" + eventType);
    }
    try {
      event.fromProtoStream(inputStream);
    } catch (EOFException eof) {
      // Truncated payload (e.g. AM died mid-write) is treated as end of
      // stream rather than an error.
      return null;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Parsed event from input stream"
          + ", eventType=" + eventType
          + ", event=" + event.toString());
    }
    return event;
  }

  /**
   * Reads all events from a DAG recovery stream until EOF.
   *
   * @return events in the order they were written
   */
  public static List<HistoryEvent> parseDAGRecoveryFile(FSDataInputStream inputStream)
      throws IOException {
    List<HistoryEvent> historyEvents = new ArrayList<HistoryEvent>();
    while (true) {
      HistoryEvent historyEvent = getNextEvent(inputStream);
      if (historyEvent == null) {
        LOG.info("Reached end of stream");
        break;
      }
      historyEvents.add(historyEvent);
    }
    return historyEvents;
  }

  /**
   * Command-line debugging entry point.
   * Usage: first arg is a summary file path; remaining args are DAG recovery
   * file paths. Each is parsed and logged.
   */
  public static void main(String argv[]) throws IOException {
    // TODO clean up with better usage and error handling
    Configuration conf = new Configuration();
    String summaryPath = argv[0];
    List<String> dagPaths = new ArrayList<String>();
    if (argv.length > 1) {
      for (int i = 1; i < argv.length; ++i) {
        dagPaths.add(argv[i]);
      }
    }
    FileSystem fs = FileSystem.get(conf);
    LOG.info("Parsing Summary file " + summaryPath);
    parseSummaryFile(fs.open(new Path(summaryPath)));
    for (String dagPath : dagPaths) {
      LOG.info("Parsing DAG recovery file " + dagPath);
      List<HistoryEvent> historyEvents = parseDAGRecoveryFile(fs.open(new Path(dagPath)));
      for (HistoryEvent historyEvent : historyEvents) {
        LOG.info("Parsed event from recovery stream"
            + ", eventType=" + historyEvent.getEventType()
            + ", event=" + historyEvent);
      }
    }
  }

  // NOTE(review): parameter name carries a typo ("Rrecovery") — kept as-is.
  /** Returns the summary file path inside the given attempt dir. */
  private Path getSummaryPath(Path attemptRrecoveryDataDir) {
    return TezCommonUtils.getSummaryRecoveryPath(attemptRrecoveryDataDir);
  }

  /** Creates (overwriting) the summary file for writing. */
  private FSDataOutputStream getSummaryOutputStream(Path summaryPath)
      throws IOException {
    return recoveryFS.create(summaryPath, true, recoveryBufferSize);
  }

  /** Opens the summary file for reading, or returns null if it is absent. */
  private FSDataInputStream getSummaryStream(Path summaryPath)
      throws IOException {
    if (!recoveryFS.exists(summaryPath)) {
      return null;
    }
    return recoveryFS.open(summaryPath, recoveryBufferSize);
  }

  /** Path of the per-DAG recovery file inside the given recovery dir. */
  private Path getDAGRecoveryFilePath(Path recoveryDataDir,
      TezDAGID dagID) {
    return new Path(recoveryDataDir,
        dagID.toString() + TezConstants.DAG_RECOVERY_RECOVER_FILE_SUFFIX);
  }

  /** Opens the per-DAG recovery file for reading, or null if absent. */
  private FSDataInputStream getDAGRecoveryStream(Path recoveryDataDir,
      TezDAGID dagID) throws IOException {
    Path dagRecoveryPath = getDAGRecoveryFilePath(recoveryDataDir, dagID);
    if (!recoveryFS.exists(dagRecoveryPath)) {
      return null;
    }
    return recoveryFS.open(dagRecoveryPath, recoveryBufferSize);
  }

  /** Creates (overwriting) the per-DAG recovery file for writing. */
  private FSDataOutputStream getDAGRecoveryOutputStream(Path recoveryDataDir,
      TezDAGID dagID) throws IOException {
    Path dagRecoveryPath = new Path(recoveryDataDir,
        dagID.toString() + TezConstants.DAG_RECOVERY_RECOVER_FILE_SUFFIX);
    return recoveryFS.create(dagRecoveryPath, true, recoveryBufferSize);
  }

  /**
   * Picks the DAG to recover: the single in-progress DAG if one exists,
   * otherwise the completed DAG with the highest id.
   *
   * @throws RuntimeException if more than one DAG is in progress (the summary
   *         data would be inconsistent — only one DAG runs at a time)
   */
  @VisibleForTesting
  DAGSummaryData getLastCompletedOrInProgressDAG(Map<TezDAGID, DAGSummaryData> dagSummaryDataMap) {
    DAGSummaryData inProgressDAG = null;
    DAGSummaryData lastCompletedDAG = null;
    for (Map.Entry<TezDAGID, DAGSummaryData> entry : dagSummaryDataMap.entrySet()) {
      if (!entry.getValue().completed) {
        if (inProgressDAG != null) {
          throw new RuntimeException("Multiple in progress DAGs seen"
              + ", dagId=" + inProgressDAG.dagId
              + ", dagId=" + entry.getKey());
        }
        inProgressDAG = entry.getValue();
      } else {
        if (lastCompletedDAG == null ||
            lastCompletedDAG.dagId.getId() < entry.getValue().dagId.getId()) {
          lastCompletedDAG = entry.getValue();
        }
      }
    }
    if (inProgressDAG == null) {
      return lastCompletedDAG;
    }
    return inProgressDAG;
  }

  /**
   * Locates the previous attempt dir to recover from.
   *
   * Preference order: the newest attempt that wrote the "data recovered"
   * flag; failing that, the oldest attempt that has a summary file; failing
   * that, attempt 1.
   *
   * @throws IOException if any earlier attempt recorded a fatal recovery
   *         error (recovery must not proceed)
   */
  private Path getPreviousAttemptRecoveryDataDir() throws IOException {
    LOG.info("Looking for the correct attempt directory to recover from");
    int foundPreviousAttempt = -1;
    // Walk attempts newest-first looking for a fully-recovered attempt.
    for (int i = currentAttemptId - 1; i > 0; --i) {
      Path attemptPath = TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, i);
      LOG.info("Looking at attempt directory, path=" + attemptPath);
      Path fatalErrorOccurred = new Path(attemptPath,
          RecoveryService.RECOVERY_FATAL_OCCURRED_DIR);
      if (recoveryFS.exists(fatalErrorOccurred)) {
        throw new IOException("Found that a fatal error occurred in"
            + " recovery during previous attempt, foundFile="
            + fatalErrorOccurred.toString());
      }
      Path dataRecoveredFile = new Path(attemptPath, dataRecoveredFileFlag);
      try {
        if (recoveryFS.exists(dataRecoveredFile)) {
          LOG.info("Found data recovered file in attempt directory"
              + ", dataRecoveredFile=" + dataRecoveredFile
              + ", path=" + attemptPath);
          foundPreviousAttempt = i;
          break;
        }
        LOG.info("Skipping attempt directory as data recovered file does not exist"
            + ", dataRecoveredFile=" + dataRecoveredFile
            + ", path=" + attemptPath);
      } catch (IOException e) {
        // Best-effort: a failed existence check on one attempt dir should not
        // abort the whole search.
        LOG.warn("Exception when checking previous attempt dir for "
            + dataRecoveredFile.toString(), e);
      }
    }
    if (foundPreviousAttempt == -1) {
      // Look for oldest summary file and use that
      LOG.info("Did not find any attempt dir that had data recovered file."
          + " Looking for oldest summary file");
      for (int i = 1; i < currentAttemptId; ++i) {
        Path attemptPath = TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, i);
        Path summaryPath = getSummaryPath(attemptPath);
        if (recoveryFS.exists(summaryPath)) {
          LOG.info("Found summary file in attempt directory"
              + ", summaryFile=" + summaryPath
              + ", path=" + attemptPath);
          foundPreviousAttempt = i;
          break;
        }
        LOG.info("Skipping attempt directory as no summary file found"
            + ", summaryFile=" + summaryPath
            + ", path=" + attemptPath);
      }
    }
    if (foundPreviousAttempt == -1) {
      LOG.info("Falling back to first attempt as no other recovered attempts"
          + " found");
      foundPreviousAttempt = 1;
    }
    return TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, foundPreviousAttempt);
  }

  /**
   * Per-DAG digest of the summary stream: completion status plus commit
   * progress for the DAG, its vertices and vertex groups. Commit-related
   * events are buffered so they can be replayed into the recovered DAG.
   */
  @VisibleForTesting
  static class DAGSummaryData {

    final TezDAGID dagId;
    String dagName;
    boolean completed = false;
    boolean dagCommitCompleted = true;
    DAGState dagState;
    // vertexId -> true once the vertex finished after its commit started
    Map<TezVertexID, Boolean> vertexCommitStatus =
        new HashMap<TezVertexID, Boolean>();
    // vertexGroupName -> true once the group commit finished
    Map<String, Boolean> vertexGroupCommitStatus =
        new HashMap<String, Boolean>();
    // Events to replay into the DAG after the recovery file is processed.
    List<HistoryEvent> bufferedSummaryEvents =
        new ArrayList<HistoryEvent>();

    DAGSummaryData(TezDAGID dagId) {
      this.dagId = dagId;
    }

    /**
     * Folds one summary record into this digest.
     *
     * @throws IOException on an event type that should never appear in the
     *         summary stream
     */
    void handleSummaryEvent(SummaryEventProto proto)
        throws IOException {
      HistoryEventType eventType =
          HistoryEventType.values()[proto.getEventType()];
      switch (eventType) {
        case DAG_SUBMITTED:
          completed = false;
          DAGSubmittedEvent dagSubmittedEvent = new DAGSubmittedEvent();
          dagSubmittedEvent.fromSummaryProtoStream(proto);
          dagName = dagSubmittedEvent.getDAGName();
          break;
        case DAG_FINISHED:
          completed = true;
          // A finished DAG implies its commit (if any) is no longer pending.
          dagCommitCompleted = true;
          DAGFinishedEvent dagFinishedEvent = new DAGFinishedEvent();
          dagFinishedEvent.fromSummaryProtoStream(proto);
          dagState = dagFinishedEvent.getState();
          break;
        case DAG_COMMIT_STARTED:
          dagCommitCompleted = false;
          break;
        case VERTEX_COMMIT_STARTED:
          VertexCommitStartedEvent vertexCommitStartedEvent =
              new VertexCommitStartedEvent();
          vertexCommitStartedEvent.fromSummaryProtoStream(proto);
          vertexCommitStatus.put(
              vertexCommitStartedEvent.getVertexID(), false);
          break;
        case VERTEX_FINISHED:
          VertexFinishedEvent vertexFinishedEvent =
              new VertexFinishedEvent();
          vertexFinishedEvent.fromSummaryProtoStream(proto);
          // Only tracked (and buffered) if a commit had started for this
          // vertex; otherwise the finish is uninteresting here.
          if (vertexCommitStatus.containsKey(vertexFinishedEvent.getVertexID())) {
            vertexCommitStatus.put(
                vertexFinishedEvent.getVertexID(), true);
            bufferedSummaryEvents.add(vertexFinishedEvent);
          }
          break;
        case VERTEX_GROUP_COMMIT_STARTED:
          VertexGroupCommitStartedEvent vertexGroupCommitStartedEvent =
              new VertexGroupCommitStartedEvent();
          vertexGroupCommitStartedEvent.fromSummaryProtoStream(proto);
          bufferedSummaryEvents.add(vertexGroupCommitStartedEvent);
          vertexGroupCommitStatus.put(
              vertexGroupCommitStartedEvent.getVertexGroupName(), false);
          break;
        case VERTEX_GROUP_COMMIT_FINISHED:
          VertexGroupCommitFinishedEvent vertexGroupCommitFinishedEvent =
              new VertexGroupCommitFinishedEvent();
          vertexGroupCommitFinishedEvent.fromSummaryProtoStream(proto);
          bufferedSummaryEvents.add(vertexGroupCommitFinishedEvent);
          vertexGroupCommitStatus.put(
              vertexGroupCommitFinishedEvent.getVertexGroupName(), true);
          break;
        default:
          String message = "Found invalid summary event that was not handled"
              + ", eventType=" + eventType.name();
          throw new IOException(message);
      }
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("dagId=").append(dagId);
      sb.append(", dagCompleted=").append(completed);
      if (!vertexCommitStatus.isEmpty()) {
        sb.append(", vertexCommitStatuses=[");
        for (Entry<TezVertexID, Boolean> entry : vertexCommitStatus.entrySet()) {
          sb.append("{ vertexId=").append(entry.getKey()).append(", committed=").append(entry.getValue())
              .append("}, ");
        }
        sb.append("]");
      }
      if (!vertexGroupCommitStatus.isEmpty()) {
        sb.append(", vertexGroupCommitStatuses=[");
        for (Entry<String, Boolean> entry : vertexGroupCommitStatus.entrySet()) {
          sb.append("{ vertexGroup=").append(entry.getKey()).append(", committed=")
              .append(entry.getValue()).append("}, ");
        }
        sb.append("]");
      }
      return sb.toString();
    }
  }

  /**
   * Checks whether a DAG can be safely resumed: any commit (DAG, vertex, or
   * vertex-group) that started but did not finish makes it non-recoverable.
   *
   * @return a human-readable reason if NOT recoverable, null if recoverable
   */
  private String isDAGRecoverable(DAGSummaryData data) {
    if (!data.dagCommitCompleted) {
      return "DAG Commit was in progress, not recoverable"
          + ", dagId=" + data.dagId;
    }
    if (!data.vertexCommitStatus.isEmpty()) {
      for (Entry<TezVertexID, Boolean> entry : data.vertexCommitStatus.entrySet()) {
        if (!(entry.getValue().booleanValue())) {
          return "Vertex Commit was in progress, not recoverable"
              + ", dagId=" + data.dagId
              + ", vertexId=" + entry.getKey();
        }
      }
    }
    if (!data.vertexGroupCommitStatus.isEmpty()) {
      for (Entry<String, Boolean> entry : data.vertexGroupCommitStatus.entrySet()) {
        if (!(entry.getValue().booleanValue())) {
          return "Vertex Group Commit was in progress, not recoverable"
              + ", dagId=" + data.dagId
              + ", vertexGroup=" + entry.getKey();
        }
      }
    }
    return null;
  }

  /**
   * Main recovery entry point.
   *
   * Reads the previous attempt's summary stream (copying it into the current
   * attempt dir as it goes), picks the DAG to recover, replays its recovery
   * file into a freshly-created {@link DAGImpl} (again copying events through
   * to the current attempt dir), then replays buffered commit-related summary
   * events. Finally drops the "data recovered" flag.
   *
   * @return recovered DAG data, or null when there is nothing to recover
   * @throws IOException on missing/corrupt recovery data or FS failures
   */
  public RecoveredDAGData parseRecoveryData() throws IOException {
    Path previousAttemptRecoveryDataDir = getPreviousAttemptRecoveryDataDir();
    LOG.info("Using " + previousAttemptRecoveryDataDir.toString()
        + " for recovering data from previous attempt");
    if (!recoveryFS.exists(previousAttemptRecoveryDataDir)) {
      LOG.info("Nothing to recover as previous attempt data does not exist"
          + ", previousAttemptDir=" + previousAttemptRecoveryDataDir.toString());
      createDataRecoveredFlagFile();
      return null;
    }

    Path summaryPath = getSummaryPath(previousAttemptRecoveryDataDir);
    FSDataInputStream summaryStream = getSummaryStream(summaryPath);
    if (summaryStream == null) {
      LOG.info("Nothing to recover as summary file does not exist"
          + ", previousAttemptDir=" + previousAttemptRecoveryDataDir.toString()
          + ", summaryPath=" + summaryPath.toString());
      createDataRecoveredFlagFile();
      return null;
    }

    // Summary records are copied into the current attempt's summary file as
    // they are digested, so this attempt's dir becomes self-contained.
    Path newSummaryPath = getSummaryPath(currentAttemptRecoveryDataDir);
    FSDataOutputStream newSummaryStream = getSummaryOutputStream(newSummaryPath);

    FileStatus summaryFileStatus = recoveryFS.getFileStatus(summaryPath);
    LOG.info("Parsing summary file"
        + ", path=" + summaryPath.toString()
        + ", len=" + summaryFileStatus.getLen()
        + ", lastModTime=" + summaryFileStatus.getModificationTime());

    int dagCounter = 0;
    Map<TezDAGID, DAGSummaryData> dagSummaryDataMap =
        new HashMap<TezDAGID, DAGSummaryData>();
    while (true) {
      RecoveryProtos.SummaryEventProto proto;
      try {
        proto = RecoveryProtos.SummaryEventProto.parseDelimitedFrom(summaryStream);
        if (proto == null) {
          LOG.info("Reached end of summary stream");
          break;
        }
      } catch (EOFException eof) {
        // Truncated final record — treat as end of stream.
        LOG.info("Reached end of summary stream");
        break;
      }
      HistoryEventType eventType =
          HistoryEventType.values()[proto.getEventType()];
      if (LOG.isDebugEnabled()) {
        LOG.debug("[RECOVERY SUMMARY]"
            + " dagId=" + proto.getDagId()
            + ", timestamp=" + proto.getTimestamp()
            + ", event=" + eventType);
      }
      TezDAGID dagId = TezDAGID.fromString(proto.getDagId());
      // Track the highest DAG id seen so new DAGs get fresh ids.
      if (dagCounter < dagId.getId()) {
        dagCounter = dagId.getId();
      }
      if (!dagSummaryDataMap.containsKey(dagId)) {
        dagSummaryDataMap.put(dagId, new DAGSummaryData(dagId));
      }
      dagSummaryDataMap.get(dagId).handleSummaryEvent(proto);
      proto.writeDelimitedTo(newSummaryStream);
    }
    summaryStream.close();
    // hsync before close so the copied summary survives an AM crash.
    newSummaryStream.hsync();
    newSummaryStream.close();

    // Set counter for next set of DAGs & update dagNames Set in DAGAppMaster
    dagAppMaster.setDAGCounter(dagCounter);
    for (DAGSummaryData dagSummaryData : dagSummaryDataMap.values()) {
      dagAppMaster.dagNames.add(dagSummaryData.dagName);
      dagAppMaster.dagIDs.add(dagSummaryData.dagId.toString());
    }

    DAGSummaryData lastInProgressDAGData =
        getLastCompletedOrInProgressDAG(dagSummaryDataMap);
    if (lastInProgressDAGData == null) {
      LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
      return null;
    }
    TezDAGID lastInProgressDAG = lastInProgressDAGData.dagId;
    if (lastInProgressDAG == null) {
      LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
      return null;
    }

    LOG.info("Checking if DAG is in recoverable state"
        + ", dagId=" + lastInProgressDAGData.dagId);

    final RecoveredDAGData recoveredDAGData = new RecoveredDAGData();
    if (lastInProgressDAGData.completed) {
      recoveredDAGData.isCompleted = true;
      recoveredDAGData.dagState = lastInProgressDAGData.dagState;
    }
    String nonRecoverableReason = isDAGRecoverable(lastInProgressDAGData);
    if (nonRecoverableReason != null) {
      LOG.warn("Found last inProgress DAG but not recoverable: "
          + lastInProgressDAGData);
      recoveredDAGData.nonRecoverable = true;
      recoveredDAGData.reason = nonRecoverableReason;
    }

    LOG.info("Trying to recover dag from recovery file"
        + ", dagId=" + lastInProgressDAG.toString()
        + ", dataDir=" + previousAttemptRecoveryDataDir
        + ", intoCurrentDir=" + currentAttemptRecoveryDataDir);

    FSDataInputStream dagRecoveryStream =
        getDAGRecoveryStream(previousAttemptRecoveryDataDir, lastInProgressDAG);
    if (dagRecoveryStream == null) {
      // Could not find data to recover
      // Error out
      throw new IOException(
          "Could not find recovery data for last in progress DAG"
              + ", dagId=" + lastInProgressDAG);
    }
    LOG.info("Copying DAG data into Current Attempt directory"
        + ", filePath="
        + getDAGRecoveryFilePath(currentAttemptRecoveryDataDir, lastInProgressDAG));
    FSDataOutputStream newDAGRecoveryStream =
        getDAGRecoveryOutputStream(currentAttemptRecoveryDataDir, lastInProgressDAG);

    // Once a terminal/non-recoverable condition is hit, remaining events are
    // not replayed (but the loop is exited via this flag, below).
    boolean skipAllOtherEvents = false;
    while (true) {
      HistoryEvent event;
      try {
        event = getNextEvent(dagRecoveryStream);
        if (event == null) {
          LOG.info("Reached end of dag recovery stream");
          break;
        }
      } catch (EOFException eof) {
        LOG.info("Reached end of dag recovery stream");
        break;
      } catch (IOException ioe) {
        LOG.warn("Corrupt data found when trying to read next event", ioe);
        break;
      }
      if (skipAllOtherEvents) {
        // hit an error - skip reading other events
        break;
      }
      HistoryEventType eventType = event.getEventType();
      switch (eventType) {
        case DAG_SUBMITTED: {
          DAGSubmittedEvent submittedEvent = (DAGSubmittedEvent) event;
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          // Recreate the DAG object; all later events are restored into it.
          recoveredDAGData.recoveredDAG =
              dagAppMaster.createDAG(submittedEvent.getDAGPlan(), lastInProgressDAG);
          recoveredDAGData.cumulativeAdditionalResources = submittedEvent
              .getCumulativeAdditionalLocalResources();
          recoveredDAGData.recoveredDagID = recoveredDAGData.recoveredDAG.getID();
          dagAppMaster.setCurrentDAG(recoveredDAGData.recoveredDAG);
          if (recoveredDAGData.nonRecoverable) {
            // DAG object is still needed for reporting; its events are not.
            skipAllOtherEvents = true;
          }
          break;
        }
        case DAG_INITIALIZED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case DAG_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case DAG_COMMIT_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case VERTEX_GROUP_COMMIT_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case VERTEX_GROUP_COMMIT_FINISHED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case DAG_FINISHED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          // If this is seen, nothing to recover
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          recoveredDAGData.isCompleted = true;
          recoveredDAGData.dagState = ((DAGFinishedEvent) event).getState();
          skipAllOtherEvents = true;
          break;
        }
        case CONTAINER_LAUNCHED: {
          // Nothing to do for now
          break;
        }
        case VERTEX_INITIALIZED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexInitializedEvent vEvent = (VertexInitializedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexStartedEvent vEvent = (VertexStartedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_PARALLELISM_UPDATED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexParallelismUpdatedEvent vEvent = (VertexParallelismUpdatedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_COMMIT_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexCommitStartedEvent vEvent = (VertexCommitStartedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_FINISHED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexFinishedEvent vEvent = (VertexFinishedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case TASK_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskStartedEvent tEvent = (TaskStartedEvent) event;
          Task task = recoveredDAGData.recoveredDAG.getVertex(tEvent.getTaskID().getVertexID())
              .getTask(tEvent.getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case TASK_FINISHED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskFinishedEvent tEvent = (TaskFinishedEvent) event;
          Task task = recoveredDAGData.recoveredDAG.getVertex(tEvent.getTaskID().getVertexID())
              .getTask(tEvent.getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case TASK_ATTEMPT_STARTED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskAttemptStartedEvent tEvent = (TaskAttemptStartedEvent) event;
          Task task = recoveredDAGData.recoveredDAG
              .getVertex(tEvent.getTaskAttemptID().getTaskID().getVertexID())
              .getTask(tEvent.getTaskAttemptID().getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case TASK_ATTEMPT_FINISHED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskAttemptFinishedEvent tEvent = (TaskAttemptFinishedEvent) event;
          Task task = recoveredDAGData.recoveredDAG
              .getVertex(tEvent.getTaskAttemptID().getTaskID().getVertexID())
              .getTask(tEvent.getTaskAttemptID().getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case VERTEX_DATA_MOVEMENT_EVENTS_GENERATED: {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexRecoverableEventsGeneratedEvent vEvent =
              (VertexRecoverableEventsGeneratedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        default:
          throw new RuntimeException("Invalid data found, unknown event type " + eventType);
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("[DAG RECOVERY]"
            + " dagId=" + lastInProgressDAG
            + ", eventType=" + eventType
            + ", event=" + event.toString());
      }
      // Copy-through: every event read (including the one that triggered
      // skipAllOtherEvents) is re-written to the current attempt's file.
      newDAGRecoveryStream.writeInt(eventType.ordinal());
      event.toProtoStream(newDAGRecoveryStream);
    }
    dagRecoveryStream.close();
    newDAGRecoveryStream.hsync();
    newDAGRecoveryStream.close();

    // Replay commit-related events that were only recorded in the summary
    // stream (they may never have reached the DAG recovery file).
    if (!recoveredDAGData.isCompleted
        && !recoveredDAGData.nonRecoverable) {
      if (lastInProgressDAGData.bufferedSummaryEvents != null
          && !lastInProgressDAGData.bufferedSummaryEvents.isEmpty()) {
        for (HistoryEvent bufferedEvent : lastInProgressDAGData.bufferedSummaryEvents) {
          assert recoveredDAGData.recoveredDAG != null;
          switch (bufferedEvent.getEventType()) {
            case VERTEX_GROUP_COMMIT_STARTED:
              recoveredDAGData.recoveredDAG.restoreFromEvent(bufferedEvent);
              break;
            case VERTEX_GROUP_COMMIT_FINISHED:
              recoveredDAGData.recoveredDAG.restoreFromEvent(bufferedEvent);
              break;
            case VERTEX_FINISHED:
              VertexFinishedEvent vertexFinishedEvent =
                  (VertexFinishedEvent) bufferedEvent;
              Vertex vertex =
                  recoveredDAGData.recoveredDAG.getVertex(vertexFinishedEvent.getVertexID());
              if (vertex == null) {
                // The vertex finished per the summary but its events never
                // made it to the recovery file — state is incomplete.
                recoveredDAGData.nonRecoverable = true;
                recoveredDAGData.reason = "All state could not be recovered"
                    + ", vertex completed but events not flushed"
                    + ", vertexId=" + vertexFinishedEvent.getVertexID();
              } else {
                vertex.restoreFromEvent(vertexFinishedEvent);
              }
              break;
            default:
              throw new RuntimeException("Invalid data found in buffered summary events"
                  + ", unknown event type " + bufferedEvent.getEventType());
          }
        }
      }
    }

    LOG.info("Finished copying data from previous attempt into current attempt");
    createDataRecoveredFlagFile();

    return recoveredDAGData;
  }

  /**
   * Marks the current attempt dir as fully recovered so later attempts will
   * prefer it. Note the flag is created with mkdirs (a directory, despite the
   * "file" naming) — existence is all that is checked.
   */
  private void createDataRecoveredFlagFile() throws IOException {
    Path dataCopiedFlagPath = new Path(currentAttemptRecoveryDataDir, dataRecoveredFileFlag);
    LOG.info("Trying to create data recovered flag file"
        + ", filePath=" + dataCopiedFlagPath.toString());
    recoveryFS.mkdirs(dataCopiedFlagPath);
  }
}