Browse Source

YARN-4431. Not necessary to do unRegisterNM() if NM get stop due to failed to connect to RM. (Junping Du via rohithsharmaks)

rohithsharmaks 9 years ago
parent
commit
6a6bbc51c4

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -1031,6 +1031,9 @@ Release 2.8.0 - UNRELEASED
     YARN-4408. Fix issue that NodeManager reports negative running containers. 
     YARN-4408. Fix issue that NodeManager reports negative running containers. 
     (Robert Kanter via junping_du)
     (Robert Kanter via junping_du)
 
 
+    YARN-4431. Not necessary to do unRegisterNM() if NM get stop due to failed to connect
+    to RM. (Junping Du via rohithsharmaks)
+
 Release 2.7.3 - UNRELEASED
 Release 2.7.3 - UNRELEASED
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 4 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java

@@ -134,6 +134,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
 
 
   private Runnable statusUpdaterRunnable;
   private Runnable statusUpdaterRunnable;
   private Thread  statusUpdater;
   private Thread  statusUpdater;
+  private boolean failedToConnect = false;
   private long rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER;
   private long rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER;
   private boolean registeredWithRM = false;
   private boolean registeredWithRM = false;
   Set<ContainerId> pendingContainersToRemove = new HashSet<ContainerId>();
   Set<ContainerId> pendingContainersToRemove = new HashSet<ContainerId>();
@@ -241,7 +242,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
     // the isStopped check is for avoiding multiple unregistrations.
     // the isStopped check is for avoiding multiple unregistrations.
     if (this.registeredWithRM && !this.isStopped
     if (this.registeredWithRM && !this.isStopped
         && !isNMUnderSupervisionWithRecoveryEnabled()
         && !isNMUnderSupervisionWithRecoveryEnabled()
-        && !context.getDecommissioned()) {
+        && !context.getDecommissioned() && !failedToConnect) {
       unRegisterNM();
       unRegisterNM();
     }
     }
     // Interrupt the updater.
     // Interrupt the updater.
@@ -823,6 +824,8 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
             //catch and throw the exception if tried MAX wait time to connect RM
             //catch and throw the exception if tried MAX wait time to connect RM
             dispatcher.getEventHandler().handle(
             dispatcher.getEventHandler().handle(
                 new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));
                 new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));
+            // failed to connect to RM.
+            failedToConnect = true;
             throw new YarnRuntimeException(e);
             throw new YarnRuntimeException(e);
           } catch (Throwable e) {
           } catch (Throwable e) {