
AMBARI-20593. EU/RU Auto-Retry does not reschedule task when host is not heartbeating before task is scheduled and doesn't have a start time (alejandro)

Alejandro Fernandez, 8 years ago
parent commit
d67d8a08fe

+ 3 - 3
ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java

@@ -168,10 +168,10 @@ public class HostRoleCommand {
     errorLog = hostRoleCommandEntity.getErrorLog();
     structuredOut = hostRoleCommandEntity.getStructuredOut() != null ? new String(hostRoleCommandEntity.getStructuredOut()) : "";
     exitCode = hostRoleCommandEntity.getExitcode();
-    startTime = hostRoleCommandEntity.getStartTime();
-    originalStartTime = hostRoleCommandEntity.getOriginalStartTime();
+    startTime = hostRoleCommandEntity.getStartTime() != null ? hostRoleCommandEntity.getStartTime() : -1L;
+    originalStartTime = hostRoleCommandEntity.getOriginalStartTime() != null ? hostRoleCommandEntity.getOriginalStartTime() : -1L;
     endTime = hostRoleCommandEntity.getEndTime() != null ? hostRoleCommandEntity.getEndTime() : -1L;
-    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime();
+    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime() != null ? hostRoleCommandEntity.getLastAttemptTime() : -1L;
     attemptCount = hostRoleCommandEntity.getAttemptCount();
     retryAllowed = hostRoleCommandEntity.isRetryAllowed();
     autoSkipFailure = hostRoleCommandEntity.isFailureAutoSkipped();
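The change above falls back to -1L whenever the entity's boxed timestamps are null, which happens when the host never heartbeated and the command was never actually scheduled. As a minimal sketch of that null-to-sentinel pattern (the helper name is hypothetical and not part of this patch):

```java
// Hypothetical helper illustrating the pattern used above: entity getters
// return boxed Long (nullable), while the command bean stores primitive
// long with -1L meaning "never set".
public final class TimeSentinel {
  private TimeSentinel() {}

  /** Returns the boxed value, or -1L if the entity never recorded a time. */
  public static long orMinusOne(Long boxedTime) {
    return boxedTime != null ? boxedTime : -1L;
  }
}
```

With such a helper the assignments would read, for example, `startTime = TimeSentinel.orMinusOne(hostRoleCommandEntity.getStartTime());`, though the patch inlines the ternary instead.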

+ 48 - 12
ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java

@@ -199,24 +199,59 @@ public class RetryUpgradeActionService extends AbstractScheduledService {
     List<HostRoleCommandEntity> holdingCommands = m_hostRoleCommandDAO.findByRequestIdAndStatuses(requestId, HOLDING_STATUSES);
     if (holdingCommands.size() > 0) {
       for (HostRoleCommandEntity hrc : holdingCommands) {
-        LOG.debug("Comparing taskId: {}, original start time: {}, now: {}",
-            hrc.getTaskId(), hrc.getOriginalStartTime(), now);
+        LOG.debug("Comparing taskId: {}, attempt count: {}, original start time: {}, now: {}",
+            hrc.getTaskId(), hrc.getAttemptCount(), hrc.getOriginalStartTime(), now);
 
         /*
+        Use-Case 1:
+        If the command has been sent to the host before because it was heartbeating, then it does have
+        an original start time, so we can attempt to retry on this host even if no longer heartbeating.
+        If the host does heartbeat again within the time interval, the command will actually be scheduled by the host.
+
+        Use-Case 2:
+        If the host is not heartbeating and the command is scheduled to be run on it, then the following
+        is true:
+        - does not have original start time
+        - does not have start time
+        - attempt count is 0
+        - status will be HOLDING_TIMEDOUT
+        When the host does start heartbeating, we need to schedule this command by changing its state back to PENDING.
+
+        Notes:
         While testing, can update the original_start_time of records in host_role_command table to current epoch time.
         E.g. in postgres,
         SELECT CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000;
         UPDATE host_role_command SET attempt_count = 1, status = 'HOLDING_FAILED', original_start_time = (CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000) WHERE task_id IN (x, y, z);
-         */
-        if (canRetryCommand(hrc) && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
-          Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
-          Long deltaMS = retryTimeWindow - now;
-
-          if (deltaMS > 0) {
-            String originalStartTimeString = m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
-            String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
-            LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
-                hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+        */
+
+        if (canRetryCommand(hrc)) {
+
+          boolean allowRetry = false;
+          // Use-Case 1
+          if (hrc.getOriginalStartTime() != null && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
+            Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
+            Long deltaMS = retryTimeWindow - now;
+
+            if (deltaMS > 0) {
+              String originalStartTimeString = m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
+              String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
+              LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
+                  hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+              allowRetry = true;
+            }
+          }
+
+          // Use-Case 2
+          if ((hrc.getOriginalStartTime() == null || hrc.getOriginalStartTime() == -1L) &&
+              (hrc.getStartTime() == null || hrc.getStartTime() == -1L) &&
+              hrc.getAttemptCount() == 0){
+            LOG.info("Re-scheduling task with id: {} since it has 0 attempts, and null start_time and " +
+                    "original_start_time, which likely means the host was not heartbeating when the command was supposed to be scheduled.",
+                hrc.getTaskId());
+            allowRetry = true;
+          }
+
+          if (allowRetry) {
             retryHostRoleCommand(hrc);
           }
         }
@@ -262,6 +297,7 @@ public class RetryUpgradeActionService extends AbstractScheduledService {
     hrc.setStatus(HostRoleStatus.PENDING);
     hrc.setStartTime(-1L);
     // Don't change the original start time.
+    hrc.setEndTime(-1L);
     hrc.setLastAttemptTime(-1L);
 
     // This will invalidate the cache, as expected.
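For readability, here is a minimal sketch (not part of the patch) of the two retry conditions the loop above now checks, collapsed into one predicate. It assumes the same `HostRoleCommandEntity` getters and `MAX_TIMEOUT_MS` constant used in `RetryUpgradeActionService`:

```java
// Sketch only: mirrors the Use-Case 1 and Use-Case 2 checks from the patch.
private boolean isEligibleForRetry(HostRoleCommandEntity hrc, long now) {
  // Use-Case 1: the command reached a heartbeating host at least once, so it
  // has an original start time and must still be inside the retry window.
  Long originalStart = hrc.getOriginalStartTime();
  if (originalStart != null && originalStart > 0 && originalStart < now) {
    return (originalStart + MAX_TIMEOUT_MS) - now > 0;
  }

  // Use-Case 2: the host was never heartbeating, so the command was never
  // scheduled: no original start time, no start time, zero attempts.
  return (originalStart == null || originalStart == -1L)
      && (hrc.getStartTime() == null || hrc.getStartTime() == -1L)
      && hrc.getAttemptCount() == 0;
}
```

When the predicate holds, the service resets the command to PENDING via retryHostRoleCommand, which (as shown above) also clears the start, end, and last-attempt times so the host can pick it up again once it heartbeats.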

+ 23 - 5
ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java

@@ -108,10 +108,12 @@ public class RetryUpgradeActionServiceTest {
    * Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
    * does NOT meet conditions to be retried => no-op
    * Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
-   * DOES meet conditions to be retried => retries the task
-   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
+   * DOES meet conditions to be retried and has values for start time and original start time => retries the task
+   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_TIMEDOUT that
+   * DOES meet conditions to be retried and does not have values for start time or original start time => retries the task
+   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
    * was already retried and has now expired => no-op
-   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+   * Case 8: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
    * during Finalize Cluster, which should not be retried => no-op
    * @throws Exception
    */
@@ -184,7 +186,23 @@ public class RetryUpgradeActionServiceTest {
     // Ensure that task 2 transitioned from HOLDING_FAILED to PENDING
     Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
 
-    // Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
+    // Case 6: Cluster with an active upgrade that contains a task in HOLDING_TIMEDOUT with no start time
+    // or original start time that DOES meet conditions to be retried.
+    hrc2.setStatus(HostRoleStatus.HOLDING_TIMEDOUT);
+    hrc2.setRetryAllowed(true);
+    hrc2.setOriginalStartTime(-1L);
+    hrc2.setStartTime(-1L);
+    hrc2.setLastAttemptTime(-1L);
+    hrc2.setEndTime(-1L);
+    hrc2.setAttemptCount((short) 0);
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    // Ensure that task 2 transitioned from HOLDING_TIMEDOUT to PENDING
+    Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+    
+    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now - (timeoutMins * 60000) - 1);
     hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
@@ -195,7 +213,7 @@ public class RetryUpgradeActionServiceTest {
 
     Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
 
-    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+    // Case 8: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
     // during Finalize Cluster, which should not be retried.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now);