浏览代码

HDDS-571. Update SCM chill mode exit criteria to optionally wait for n datanodes. Contributed by Ajay Kumar.

Ajay Kumar 6 年之前
父节点
当前提交
cdf5d58364

+ 3 - 0
hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java

@@ -83,6 +83,9 @@ public final class HddsConfigKeys {
   public static final String HDDS_SCM_CHILLMODE_ENABLED =
   public static final String HDDS_SCM_CHILLMODE_ENABLED =
       "hdds.scm.chillmode.enabled";
       "hdds.scm.chillmode.enabled";
   public static final boolean HDDS_SCM_CHILLMODE_ENABLED_DEFAULT = true;
   public static final boolean HDDS_SCM_CHILLMODE_ENABLED_DEFAULT = true;
+  public static final String HDDS_SCM_CHILLMODE_MIN_DATANODE =
+      "hdds.scm.chillmode.min.datanode";
+  public static final int HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT = 1;
 
 
   // % of containers which should have at least one reported replica
   // % of containers which should have at least one reported replica
   // before SCM comes out of chill mode.
   // before SCM comes out of chill mode.

+ 9 - 0
hadoop-hdds/common/src/main/resources/ozone-default.xml

@@ -1164,6 +1164,15 @@
     </description>
     </description>
   </property>
   </property>
 
 
+  <property>
+    <name>hdds.scm.chillmode.min.datanode</name>
+    <value>1</value>
+    <tag>HDDS,SCM,OPERATION</tag>
+    <description>Minimum number of DataNodes that must be registered before
+      SCM can exit chill mode. A value of 0 disables this check.
+    </description>
+  </property>
+
   <property>
   <property>
     <name>hdds.container.action.max.limit</name>
     <name>hdds.container.action.max.limit</name>
     <value>20</value>
     <value>20</value>

+ 61 - 2
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMChillModeManager.java

@@ -20,8 +20,10 @@ package org.apache.hadoop.hdds.scm.server;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.annotations.VisibleForTesting;
 import java.util.EnumSet;
 import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.List;
 import java.util.Map;
 import java.util.Map;
+import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicLong;
@@ -60,14 +62,16 @@ public class SCMChillModeManager implements
   private Map<String, ChillModeExitRule> exitRules = new HashMap(1);
   private Map<String, ChillModeExitRule> exitRules = new HashMap(1);
   private Configuration config;
   private Configuration config;
   private static final String CONT_EXIT_RULE = "ContainerChillModeRule";
   private static final String CONT_EXIT_RULE = "ContainerChillModeRule";
+  private static final String DN_EXIT_RULE = "DataNodeChillModeRule";
   private final EventQueue eventPublisher;
   private final EventQueue eventPublisher;
 
 
   SCMChillModeManager(Configuration conf, List<ContainerInfo> allContainers,
   SCMChillModeManager(Configuration conf, List<ContainerInfo> allContainers,
       EventQueue eventQueue) {
       EventQueue eventQueue) {
     this.config = conf;
     this.config = conf;
     this.eventPublisher = eventQueue;
     this.eventPublisher = eventQueue;
-    exitRules
-        .put(CONT_EXIT_RULE, new ContainerChillModeRule(config, allContainers));
+    exitRules.put(CONT_EXIT_RULE,
+        new ContainerChillModeRule(config, allContainers));
+    exitRules.put(DN_EXIT_RULE, new DataNodeChillModeRule(config));
     if (!conf.getBoolean(HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED,
     if (!conf.getBoolean(HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED,
         HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED_DEFAULT)) {
         HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED_DEFAULT)) {
       exitChillMode(eventQueue);
       exitChillMode(eventQueue);
@@ -120,6 +124,7 @@ public class SCMChillModeManager implements
       EventPublisher publisher) {
       EventPublisher publisher) {
     if (getInChillMode()) {
     if (getInChillMode()) {
       exitRules.get(CONT_EXIT_RULE).process(nodeRegistrationContainerReport);
       exitRules.get(CONT_EXIT_RULE).process(nodeRegistrationContainerReport);
+      exitRules.get(DN_EXIT_RULE).process(nodeRegistrationContainerReport);
       validateChillModeExitRules(publisher);
       validateChillModeExitRules(publisher);
     }
     }
   }
   }
@@ -187,6 +192,9 @@ public class SCMChillModeManager implements
 
 
     @VisibleForTesting
     @VisibleForTesting
     public double getCurrentContainerThreshold() {
     public double getCurrentContainerThreshold() {
+      if (maxContainer == 0) {
+        return 1;
+      }
       return (containerWithMinReplicas.doubleValue() / maxContainer);
       return (containerWithMinReplicas.doubleValue() / maxContainer);
     }
     }
 
 
@@ -217,6 +225,57 @@ public class SCMChillModeManager implements
     }
     }
   }
   }
 
 
+  /**
+   * Chill mode exit rule based on the number of distinct DataNodes that have
+   * registered with SCM. The rule is satisfied once the registered count
+   * reaches the value configured under
+   * {@link HddsConfigKeys#HDDS_SCM_CHILLMODE_MIN_DATANODE}; a configured value
+   * of 0 makes the rule pass without tracking any DataNode.
+   */
+  public class DataNodeChillModeRule implements
+      ChillModeExitRule<NodeRegistrationContainerReport> {
+
+    // Min DataNodes required to exit chill mode.
+    private final int requiredDns;
+    // Distinct DataNodes seen so far; refreshed from registeredDnSet.size()
+    // each time a report is processed.
+    private int registeredDns = 0;
+    // Set to track registered DataNodes, so repeated reports from the same
+    // DataNode are counted once.
+    private final HashSet<UUID> registeredDnSet;
+
+    /**
+     * Reads the required DataNode count from the configuration.
+     *
+     * @param conf configuration providing
+     *             {@link HddsConfigKeys#HDDS_SCM_CHILLMODE_MIN_DATANODE}.
+     */
+    public DataNodeChillModeRule(Configuration conf) {
+      requiredDns = conf
+          .getInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE,
+              HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT);
+      // Do not presize from the raw config value: a negative setting would
+      // make HashSet(int) throw IllegalArgumentException, and doubling a
+      // large value can overflow int.
+      registeredDnSet = new HashSet<>();
+    }
+
+    /**
+     * @return true once the registered DataNode count has reached the
+     *         required minimum.
+     */
+    @Override
+    public boolean validate() {
+      return registeredDns >= requiredDns;
+    }
+
+    /**
+     * @return number of distinct DataNodes recorded by this rule.
+     */
+    @VisibleForTesting
+    public double getRegisteredDataNodes() {
+      return registeredDns;
+    }
+
+    /**
+     * Records the DataNode that sent the given registration report. Nothing
+     * is recorded when no DataNodes are required or when the enclosing
+     * manager is no longer in chill mode.
+     *
+     * @param reportsProto registration report whose DataNode UUID is tracked.
+     */
+    @Override
+    public void process(NodeRegistrationContainerReport reportsProto) {
+      if (requiredDns == 0) {
+        // No dn check required.
+        return;
+      }
+
+      if (inChillMode.get()) {
+        registeredDnSet.add(reportsProto.getDatanodeDetails().getUuid());
+        registeredDns = registeredDnSet.size();
+        LOG.info("SCM in chill mode. {} DataNodes registered, {} required.",
+            registeredDns, requiredDns);
+      }
+    }
+
+    /**
+     * Drops the tracked DataNode ids. The registered count is left as is.
+     */
+    @Override
+    public void cleanup() {
+      registeredDnSet.clear();
+    }
+  }
+
   @VisibleForTesting
   @VisibleForTesting
   public static Logger getLogger() {
   public static Logger getLogger() {
     return LOG;
     return LOG;

+ 40 - 1
hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMChillModeManager.java

@@ -45,7 +45,7 @@ public class TestSCMChillModeManager {
   private List<ContainerInfo> containers;
   private List<ContainerInfo> containers;
 
 
   @Rule
   @Rule
-  public Timeout timeout = new Timeout(1000 * 20);
+  public Timeout timeout = new Timeout(1000 * 35);
 
 
   @BeforeClass
   @BeforeClass
   public static void setUp() {
   public static void setUp() {
@@ -111,6 +111,45 @@ public class TestSCMChillModeManager {
     assertFalse(scmChillModeManager.getInChillMode());
     assertFalse(scmChillModeManager.getInChillMode());
   }
   }
 
 
+  /**
+   * Runs the DataNode exit-rule scenario against an empty container list for
+   * 0, 3 and 5 required DataNodes, in that order.
+   */
+  @Test
+  public void testChillModeDataNodeExitRule() throws Exception {
+    containers = new ArrayList<>();
+    final int[] requiredDnCounts = {0, 3, 5};
+    for (int requiredDnCount : requiredDnCounts) {
+      testChillModeDataNodes(requiredDnCount);
+    }
+  }
+
+  /**
+   * Checks that SCM stays in chill mode until {@code numOfDns} DataNodes have
+   * registered and leaves it once the last one registers.
+   *
+   * @param numOfDns value set for
+   *                 {@link HddsConfigKeys#HDDS_SCM_CHILLMODE_MIN_DATANODE};
+   *                 with 0 no registration is fired and SCM is only expected
+   *                 to still report chill mode.
+   * @throws Exception if a wait times out or is interrupted.
+   */
+  private void testChillModeDataNodes(int numOfDns) throws Exception {
+    OzoneConfiguration conf = new OzoneConfiguration(config);
+    conf.setInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE, numOfDns);
+    scmChillModeManager = new SCMChillModeManager(conf, containers, queue);
+    queue.addHandler(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
+        scmChillModeManager);
+    // Assert SCM is in Chill mode.
+    assertTrue(scmChillModeManager.getInChillMode());
+
+    // Register all DataNodes except last one and assert SCM is in chill mode.
+    for (int i = 0; i < numOfDns - 1; i++) {
+      queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
+          HddsTestUtils.createNodeRegistrationContainerReport(containers));
+      assertTrue(scmChillModeManager.getInChillMode());
+      assertTrue(scmChillModeManager.getCurrentContainerThreshold() == 1);
+    }
+
+    if (numOfDns == 0) {
+      // Nothing to register in this scenario; SCM should still be in chill
+      // mode.
+      GenericTestUtils.waitFor(() -> {
+        return scmChillModeManager.getInChillMode();
+      }, 10, 1000 * 10);
+      return;
+    }
+    // Register last DataNode and check that SCM is out of Chill mode.
+    queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
+        HddsTestUtils.createNodeRegistrationContainerReport(containers));
+    // Wait for the flag to clear. The manager was already in chill mode
+    // before this event, so waiting for the flag to be set would not verify
+    // the exit.
+    GenericTestUtils.waitFor(() -> {
+      return !scmChillModeManager.getInChillMode();
+    }, 10, 1000 * 10);
+  }
+
   private void testContainerThreshold(List<ContainerInfo> dnContainers,
   private void testContainerThreshold(List<ContainerInfo> dnContainers,
       double expectedThreshold)
       double expectedThreshold)
       throws Exception {
       throws Exception {