
AMBARI-20593. EU/RU Auto-Retry does not reschedule task when host is not heartbeating before task is scheduled and doesn't have a start time (alejandro)

Alejandro Fernandez, 8 years ago
parent commit
d67d8a08fe

+ 3 - 3
ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java

@@ -168,10 +168,10 @@ public class HostRoleCommand {
     errorLog = hostRoleCommandEntity.getErrorLog();
     structuredOut = hostRoleCommandEntity.getStructuredOut() != null ? new String(hostRoleCommandEntity.getStructuredOut()) : "";
     exitCode = hostRoleCommandEntity.getExitcode();
-    startTime = hostRoleCommandEntity.getStartTime();
-    originalStartTime = hostRoleCommandEntity.getOriginalStartTime();
+    startTime = hostRoleCommandEntity.getStartTime() != null ? hostRoleCommandEntity.getStartTime() : -1L;
+    originalStartTime = hostRoleCommandEntity.getOriginalStartTime() != null ? hostRoleCommandEntity.getOriginalStartTime() : -1L;
     endTime = hostRoleCommandEntity.getEndTime() != null ? hostRoleCommandEntity.getEndTime() : -1L;
-    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime();
+    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime() != null ? hostRoleCommandEntity.getLastAttemptTime() : -1L;
     attemptCount = hostRoleCommandEntity.getAttemptCount();
     retryAllowed = hostRoleCommandEntity.isRetryAllowed();
     autoSkipFailure = hostRoleCommandEntity.isFailureAutoSkipped();
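The change above falls back to -1L whenever the entity's boxed timestamps are null, which happens when the host never heartbeated and the command was never actually scheduled. As a minimal sketch of that null-to-sentinel pattern (the helper name is hypothetical and not part of this patch):

```java
// Hypothetical helper illustrating the pattern used above: entity getters
// return boxed Long (nullable), while the command bean stores primitive
// long with -1L meaning "never set".
public final class TimeSentinel {
  private TimeSentinel() {}

  /** Returns the boxed value, or -1L if the entity never recorded a time. */
  public static long orMinusOne(Long boxedTime) {
    return boxedTime != null ? boxedTime : -1L;
  }
}
```

With such a helper the assignments would read, for example, `startTime = TimeSentinel.orMinusOne(hostRoleCommandEntity.getStartTime());`, though the patch inlines the ternary instead.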

+ 48 - 12
ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java

@@ -199,24 +199,59 @@ public class RetryUpgradeActionService extends AbstractScheduledService {
     List<HostRoleCommandEntity> holdingCommands = m_hostRoleCommandDAO.findByRequestIdAndStatuses(requestId, HOLDING_STATUSES);
     if (holdingCommands.size() > 0) {
       for (HostRoleCommandEntity hrc : holdingCommands) {
-        LOG.debug("Comparing taskId: {}, original start time: {}, now: {}",
-            hrc.getTaskId(), hrc.getOriginalStartTime(), now);
+        LOG.debug("Comparing taskId: {}, attempt count: {}, original start time: {}, now: {}",
+            hrc.getTaskId(), hrc.getAttemptCount(), hrc.getOriginalStartTime(), now);
 
         /*
+        Use-Case 1:
+        If the command has been sent to the host before because it was heartbeating, then it does have
+        an original start time, so we can attempt to retry on this host even if no longer heartbeating.
+        If the host does heartbeat again within the time interval, the command will actually be scheduled by the host.
+
+        Use-Case 2:
+        If the host is not heartbeating and the command is scheduled to be run on it, then the following
+        is true:
+        - does not have original start time
+        - does not have start time
+        - attempt count is 0
+        - status will be HOLDING_TIMEDOUT
+        When the host does start heartbeating, we need to schedule this command by changing its state back to PENDING.
+
+        Notes:
         While testing, can update the original_start_time of records in host_role_command table to current epoch time.
         E.g. in postgres,
         SELECT CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000;
         UPDATE host_role_command SET attempt_count = 1, status = 'HOLDING_FAILED', original_start_time = (CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000) WHERE task_id IN (x, y, z);
-         */
-        if (canRetryCommand(hrc) && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
-          Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
-          Long deltaMS = retryTimeWindow - now;
-
-          if (deltaMS > 0) {
-            String originalStartTimeString = m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
-            String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
-            LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
-                hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+        */
+
+        if (canRetryCommand(hrc)) {
+
+          boolean allowRetry = false;
+          // Use-Case 1
+          if (hrc.getOriginalStartTime() != null && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
+            Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
+            Long deltaMS = retryTimeWindow - now;
+
+            if (deltaMS > 0) {
+              String originalStartTimeString = m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
+              String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
+              LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
+                  hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+              allowRetry = true;
+            }
+          }
+
+          // Use-Case 2
+          if ((hrc.getOriginalStartTime() == null || hrc.getOriginalStartTime() == -1L) &&
+              (hrc.getStartTime() == null || hrc.getStartTime() == -1L) &&
+              hrc.getAttemptCount() == 0){
+            LOG.info("Re-scheduling task with id: {} since it has 0 attempts, and null start_time and " +
+                    "original_start_time, which likely means the host was not heartbeating when the command was supposed to be scheduled.",
+                hrc.getTaskId());
+            allowRetry = true;
+          }
+
+          if (allowRetry) {
             retryHostRoleCommand(hrc);
           }
         }
@@ -262,6 +297,7 @@ public class RetryUpgradeActionService extends AbstractScheduledService {
     hrc.setStatus(HostRoleStatus.PENDING);
     hrc.setStartTime(-1L);
     // Don't change the original start time.
+    hrc.setEndTime(-1L);
     hrc.setLastAttemptTime(-1L);
 
     // This will invalidate the cache, as expected.
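For readability, here is a minimal sketch (not part of the patch) of the two retry conditions the loop above now checks, collapsed into one predicate. It assumes the same `HostRoleCommandEntity` getters and `MAX_TIMEOUT_MS` constant used in `RetryUpgradeActionService`:

```java
// Sketch only: mirrors the Use-Case 1 and Use-Case 2 checks from the patch.
private boolean isEligibleForRetry(HostRoleCommandEntity hrc, long now) {
  // Use-Case 1: the command reached a heartbeating host at least once, so it
  // has an original start time and must still be inside the retry window.
  Long originalStart = hrc.getOriginalStartTime();
  if (originalStart != null && originalStart > 0 && originalStart < now) {
    return (originalStart + MAX_TIMEOUT_MS) - now > 0;
  }

  // Use-Case 2: the host was never heartbeating, so the command was never
  // scheduled: no original start time, no start time, zero attempts.
  return (originalStart == null || originalStart == -1L)
      && (hrc.getStartTime() == null || hrc.getStartTime() == -1L)
      && hrc.getAttemptCount() == 0;
}
```

When the predicate holds, the service resets the command to PENDING via retryHostRoleCommand, which (as shown above) also clears the start, end, and last-attempt times so the host can pick it up again once it heartbeats.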

+ 23 - 5
ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java

@@ -108,10 +108,12 @@ public class RetryUpgradeActionServiceTest {
    * Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
    * does NOT meet conditions to be retried => no-op
    * Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
-   * DOES meet conditions to be retried => retries the task
-   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
+   * DOES meet conditions to be retried and has values for start time and original start time => retries the task
+   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_TIMEDOUT that
+   * DOES meet conditions to be retried and does not have values for start time or original start time => retries the task
+   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
    * was already retried and has now expired => no-op
-   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+   * Case 8: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
    * during Finalize Cluster, which should not be retried => no-op
    * @throws Exception
    */
@@ -184,7 +186,23 @@ public class RetryUpgradeActionServiceTest {
     // Ensure that task 2 transitioned from HOLDING_FAILED to PENDING
     Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
 
-    // Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
+    // Case 6: Cluster with an active upgrade that contains a task in HOLDING_TIMEDOUT with no start time
+    // or original start time that DOES meet conditions to be retried.
+    hrc2.setStatus(HostRoleStatus.HOLDING_TIMEDOUT);
+    hrc2.setRetryAllowed(true);
+    hrc2.setOriginalStartTime(-1L);
+    hrc2.setStartTime(-1L);
+    hrc2.setLastAttemptTime(-1L);
+    hrc2.setEndTime(-1L);
+    hrc2.setAttemptCount((short) 0);
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    // Ensure that task 2 transitioned from HOLDING_TIMEDOUT to PENDING
+    Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+    
+    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now - (timeoutMins * 60000) - 1);
     hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
@@ -195,7 +213,7 @@ public class RetryUpgradeActionServiceTest {
 
     Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
 
-    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+    // Case 8: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
     // during Finalize Cluster, which should not be retried.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now);