Browse Source

YARN-4101. RM should print alert messages if Zookeeper and Resourcemanager gets connection issue. Contributed by Xuan Gong
(cherry picked from commit 09c64ba1ba8be7a2ac31f4e42efb8c99b682399f)

Jian He 10 năm trước cách đây
mục cha
commit
cd82fa2f83

+ 4 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java

@@ -1141,4 +1141,8 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
       ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) + 
       " cb=" + appClient;
   }
+
+  public String getHAZookeeperConnectionState() {
+    return this.zkConnectionState.name();
+  }
 }

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -819,6 +819,9 @@ Release 2.7.2 - UNRELEASED
     YARN-3893. Both RM in active state when Admin#transitionToActive failure 
     from refeshAll() (Bibin A Chundatt via rohithsharmaks)
 
+    YARN-4101. RM should print alert messages if Zookeeper and Resourcemanager
+    gets connection issue. (Xuan Gong via jianhe)
+
 Release 2.7.1 - 2015-07-06
 
   INCOMPATIBLE CHANGES

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java

@@ -782,4 +782,13 @@ public class AdminService extends CompositeService implements
       throw logAndWrapException(e, user.getShortUserName(), argName, msg);
     }
   }
+
+  public String getHAZookeeperConnectionState() {
+    if (!rmContext.isHAEnabled()) {
+      return "ResourceManager HA is not enabled.";
+    } else if (!autoFailoverEnabled) {
+      return "Auto Failover is not enabled.";
+    }
+    return this.embeddedElector.getHAZookeeperConnectionState();
+  }
 }

+ 4 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java

@@ -205,4 +205,8 @@ public class EmbeddedElectorService extends AbstractService
     elector.quitElection(false);
     elector.joinElection(localActiveNodeInfo);
   }
+
+  public String getHAZookeeperConnectionState() {
+    return elector.getHAZookeeperConnectionState();
+  }
 }

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/AboutBlock.java

@@ -44,6 +44,8 @@ public class AboutBlock extends HtmlBlock {
       _("Cluster ID:", cinfo.getClusterId()).
       _("ResourceManager state:", cinfo.getState()).
       _("ResourceManager HA state:", cinfo.getHAState()).
+      _("ResourceManager HA zookeeper connection state:",
+          cinfo.getHAZookeeperConnectionState()).
       _("ResourceManager RMStateStore:", cinfo.getRMStateStore()).
       _("ResourceManager started on:", Times.format(cinfo.getStartedOn())).
       _("ResourceManager version:", cinfo.getRMBuildVersion() +

+ 5 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebApp.java

@@ -117,4 +117,9 @@ public class RMWebApp extends WebApp implements YarnWebParams {
     }
     return path;
   }
+
+  public String getHAZookeeperConnectionState() {
+    return rm.getRMContext().getRMAdminService()
+      .getHAZookeeperConnectionState();
+  }
 }

+ 4 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/RMWebAppFilter.java

@@ -58,6 +58,7 @@ public class RMWebAppFilter extends GuiceContainer {
   private String path;
   private static final int BASIC_SLEEP_TIME = 5;
   private static final int MAX_SLEEP_TIME = 5 * 60;
+  private static final Random randnum = new Random();
 
   @Inject
   public RMWebAppFilter(Injector injector, Configuration conf) {
@@ -126,6 +127,8 @@ public class RMWebAppFilter extends GuiceContainer {
         String redirectMsg =
             doRetry ? "Can not find any active RM. Will retry in next " + next
                 + " seconds." : "There is no active RM right now.";
+        redirectMsg += "\nHA Zookeeper Connection State: "
+            + rmWebApp.getHAZookeeperConnectionState();
         PrintWriter out = response.getWriter();
         out.println(redirectMsg);
         if (doRetry) {
@@ -172,6 +175,6 @@ public class RMWebAppFilter extends GuiceContainer {
 
   private static int calculateExponentialTime(int retries) {
     long baseTime = BASIC_SLEEP_TIME * (1L << retries);
-    return (int) (baseTime * ((new Random()).nextDouble() + 0.5));
+    return (int) (baseTime * (randnum.nextDouble() + 0.5));
   }
 }

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterInfo.java

@@ -43,6 +43,7 @@ public class ClusterInfo {
   protected String hadoopVersion;
   protected String hadoopBuildVersion;
   protected String hadoopVersionBuiltOn;
+  protected String haZooKeeperConnectionState;
 
   public ClusterInfo() {
   } // JAXB needs this
@@ -62,6 +63,8 @@ public class ClusterInfo {
     this.hadoopVersion = VersionInfo.getVersion();
     this.hadoopBuildVersion = VersionInfo.getBuildVersion();
     this.hadoopVersionBuiltOn = VersionInfo.getDate();
+    this.haZooKeeperConnectionState =
+        rm.getRMContext().getRMAdminService().getHAZookeeperConnectionState();
   }
 
   public String getState() {
@@ -108,4 +111,7 @@ public class ClusterInfo {
     return this.startedOn;
   }
 
+  public String getHAZookeeperConnectionState() {
+    return this.haZooKeeperConnectionState;
+  }
 }

+ 6 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java

@@ -285,6 +285,8 @@ public class TestRMWebServices extends JerseyTestBase {
           WebServicesTestUtils.getXmlLong(element, "startedOn"),
           WebServicesTestUtils.getXmlString(element, "state"),
           WebServicesTestUtils.getXmlString(element, "haState"),
+          WebServicesTestUtils.getXmlString(
+              element, "haZooKeeperConnectionState"),
           WebServicesTestUtils.getXmlString(element, "hadoopVersionBuiltOn"),
           WebServicesTestUtils.getXmlString(element, "hadoopBuildVersion"),
           WebServicesTestUtils.getXmlString(element, "hadoopVersion"),
@@ -300,9 +302,10 @@ public class TestRMWebServices extends JerseyTestBase {
       Exception {
     assertEquals("incorrect number of elements", 1, json.length());
     JSONObject info = json.getJSONObject("clusterInfo");
-    assertEquals("incorrect number of elements", 11, info.length());
+    assertEquals("incorrect number of elements", 12, info.length());
     verifyClusterGeneric(info.getLong("id"), info.getLong("startedOn"),
         info.getString("state"), info.getString("haState"),
+        info.getString("haZooKeeperConnectionState"),
         info.getString("hadoopVersionBuiltOn"),
         info.getString("hadoopBuildVersion"), info.getString("hadoopVersion"),
         info.getString("resourceManagerVersionBuiltOn"),
@@ -312,7 +315,8 @@ public class TestRMWebServices extends JerseyTestBase {
   }
 
   public void verifyClusterGeneric(long clusterid, long startedon,
-      String state, String haState, String hadoopVersionBuiltOn,
+      String state, String haState, String haZooKeeperConnectionState,
+      String hadoopVersionBuiltOn,
       String hadoopBuildVersion, String hadoopVersion,
       String resourceManagerVersionBuiltOn, String resourceManagerBuildVersion,
       String resourceManagerVersion) {