Browse Source

YARN-502. Fixed a state machine issue with RMNode inside ResourceManager which was crashing scheduler. Contributed by Mayank Bansal.
svn merge --ignore-ancestry -c 1509060 ../../trunk/


git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1509061 13f79535-47bb-0310-9956-ffa450edef68

Vinod Kumar Vavilapalli 12 years ago
parent
commit
b1cec77b5f

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -30,6 +30,9 @@ Release 2.1.1-beta - UNRELEASED
     YARN-966. Fixed ContainerLaunch to not fail quietly when there are no
     YARN-966. Fixed ContainerLaunch to not fail quietly when there are no
     localized resources due to some other failure. (Zhijie Shen via vinodkv)
     localized resources due to some other failure. (Zhijie Shen via vinodkv)
 
 
+    YARN-502. Fixed a state machine issue with RMNode inside ResourceManager
+    which was crashing scheduler. (Mayank Bansal via vinodkv)
+
 Release 2.1.0-beta - 2013-08-06
 Release 2.1.0-beta - 2013-08-06
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 7 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java

@@ -501,8 +501,13 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
     public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
     public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
       // Inform the scheduler
       // Inform the scheduler
       rmNode.nodeUpdateQueue.clear();
       rmNode.nodeUpdateQueue.clear();
-      rmNode.context.getDispatcher().getEventHandler().handle(
-          new NodeRemovedSchedulerEvent(rmNode));
+      // If the current state is NodeState.UNHEALTHY
+      // Then node is already been removed from the
+      // Scheduler
+      if (!rmNode.getState().equals(NodeState.UNHEALTHY)) {
+        rmNode.context.getDispatcher().getEventHandler()
+          .handle(new NodeRemovedSchedulerEvent(rmNode));
+      }
       rmNode.context.getDispatcher().getEventHandler().handle(
       rmNode.context.getDispatcher().getEventHandler().handle(
           new NodesListManagerEvent(
           new NodesListManagerEvent(
               NodesListManagerEventType.NODE_UNUSABLE, rmNode));
               NodesListManagerEventType.NODE_UNUSABLE, rmNode));

+ 10 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java

@@ -48,6 +48,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
@@ -269,6 +270,15 @@ public class TestRMNodeTransitions {
     node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE));
     node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE));
     Assert.assertEquals(NodeState.LOST, node.getState());
     Assert.assertEquals(NodeState.LOST, node.getState());
   }
   }
+  
+  @Test
+  public void testUnhealthyExpireForSchedulerRemove() {
+    RMNodeImpl node = getUnhealthyNode();
+    verify(scheduler,times(2)).handle(any(NodeRemovedSchedulerEvent.class));
+    node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE));
+    verify(scheduler,times(2)).handle(any(NodeRemovedSchedulerEvent.class));
+    Assert.assertEquals(NodeState.LOST, node.getState());
+  }
 
 
   @Test
   @Test
   public void testRunningDecommission() {
   public void testRunningDecommission() {