浏览代码

HDDS-1027. Add blockade Tests for datanode isolation and scm failures. Contributed by Nilotpal Nandi.

Mukul Kumar Singh 6 年之前
父节点
当前提交
911790cc26

+ 143 - 0
hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py

@@ -0,0 +1,143 @@
+#!/usr/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import logging
+from blockadeUtils.blockade import Blockade
+from clusterUtils.cluster_utils import ClusterUtils
+
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Parent of the directory that contains this file (symlinks resolved).
+parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+# Compose file handed to every ClusterUtils call in this module.
+FILE = os.path.join(parent_dir, "compose", "ozoneblockade",
+                    "docker-compose.yaml")
+# Passed to cluster_setup() and find_all_datanodes_container_status();
+# presumably the number of datanodes to start -- confirm in ClusterUtils.
+SCALE = 3
+# Placeholders only: setup() rebinds all four from the cluster_setup() result.
+CONTAINER_LIST = []
+OM = []
+SCM = []
+DATANODES = []
+
+
+def setup():
+    global CONTAINER_LIST, OM, SCM, DATANODES
+    Blockade.blockade_destroy()
+    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
+    exit_code, output = Blockade.blockade_status()
+    assert exit_code == 0, "blockade status command failed with output=[%s]" % \
+                           output
+    OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST)
+    SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST)
+    DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST)))
+
+    exit_code, output = \
+        ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    assert exit_code == 0, "freon run failed with output=[%s]" % output
+
+
+def teardown():
+    """
+    Log that teardown is running, then destroy the blockade.
+
+    NOTE(review): presumably invoked by the test runner as a fixture; whether
+    that happens per test or per module depends on the runner -- confirm.
+    """
+    logger.info("Inside teardown")
+    Blockade.blockade_destroy()
+
+
+def teardown_module():
+    """Call ClusterUtils.cluster_destroy on the compose file FILE."""
+    ClusterUtils.cluster_destroy(FILE)
+
+
+def test_three_dns_isolate_onescmfailure():
+    """
+    In this test, all datanodes are isolated from each other.
+    One of the datanodes (third datanode) cannot communicate with SCM.
+    Expectation :
+    The container replica state in first datanode should be closed.
+    The container replica state in second datanode should be closed.
+    The container replica state in third datanode should be open.
+    """
+    first_set = [OM[0], SCM[0], DATANODES[0]]
+    second_set = [OM[0], SCM[0], DATANODES[1]]
+    third_set = [OM[0], DATANODES[2]]
+    Blockade.blockade_create_partition(first_set, second_set, third_set)
+    Blockade.blockade_status()
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
+    first_datanode_status = all_datanodes_container_status[0]
+    second_datanode_status = all_datanodes_container_status[1]
+    third_datanode_status = all_datanodes_container_status[2]
+    assert first_datanode_status == 'CLOSED'
+    assert second_datanode_status == 'CLOSED'
+    assert third_datanode_status == 'OPEN'
+
+
+def test_three_dns_isolate_twoscmfailure():
+    """
+    In this test, all datanodes are isolated from each other.
+    two datanodes cannot communicate with SCM (second datanode and third
+    datanode)
+    Expectation :
+    The container replica state in first datanode should be quasi-closed.
+    The container replica state in second datanode should be open.
+    The container replica state in third datanode should be open.
+    """
+    first_set = [OM[0], SCM[0], DATANODES[0]]
+    second_set = [OM[0], DATANODES[1]]
+    third_set = [OM[0], DATANODES[2]]
+    Blockade.blockade_create_partition(first_set, second_set, third_set)
+    Blockade.blockade_status()
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
+    first_datanode_status = all_datanodes_container_status[0]
+    second_datanode_status = all_datanodes_container_status[1]
+    third_datanode_status = all_datanodes_container_status[2]
+    assert first_datanode_status == 'QUASI_CLOSED'
+    assert second_datanode_status == 'OPEN'
+    assert third_datanode_status == 'OPEN'
+
+
+def test_three_dns_isolate_threescmfailure():
+    """
+    In this test, all datanodes are isolated from each other and also cannot
+    communicate with SCM.
+    Expectation :
+    The container replica state in first datanode should be open.
+    The container replica state in second datanode should be open.
+    The container replica state in third datanode should be open.
+    """
+    first_set = [OM[0], DATANODES[0]]
+    second_set = [OM[0], DATANODES[1]]
+    third_set = [OM[0], DATANODES[2]]
+    Blockade.blockade_create_partition(first_set, second_set, third_set)
+    Blockade.blockade_status()
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
+    first_datanode_status = all_datanodes_container_status[0]
+    second_datanode_status = all_datanodes_container_status[1]
+    third_datanode_status = all_datanodes_container_status[2]
+    assert first_datanode_status == 'OPEN'
+    assert second_datanode_status == 'OPEN'
+    assert third_datanode_status == 'OPEN'

+ 120 - 0
hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py

@@ -0,0 +1,120 @@
+#!/usr/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import logging
+from blockadeUtils.blockade import Blockade
+from clusterUtils.cluster_utils import ClusterUtils
+
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Parent of the directory that contains this file (symlinks resolved).
+parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+# Compose file handed to every ClusterUtils call in this module.
+FILE = os.path.join(parent_dir, "compose", "ozoneblockade",
+                    "docker-compose.yaml")
+# Passed to cluster_setup() and find_all_datanodes_container_status();
+# presumably the number of datanodes to start -- confirm in ClusterUtils.
+SCALE = 3
+# Placeholders only: setup() rebinds all four from the cluster_setup() result.
+CONTAINER_LIST = []
+OM = []
+SCM = []
+DATANODES = []
+
+
+def setup():
+    global CONTAINER_LIST, OM, SCM, DATANODES
+    Blockade.blockade_destroy()
+    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
+    exit_code, output = Blockade.blockade_status()
+    assert exit_code == 0, "blockade status command failed with output=[%s]" % \
+                           output
+    OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST)
+    SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST)
+    DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST)))
+
+    exit_code, output = \
+        ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    assert exit_code == 0, "freon run failed with output=[%s]" % output
+
+
+def teardown():
+    """
+    Log that teardown is running, then destroy the blockade.
+
+    NOTE(review): presumably invoked by the test runner as a fixture; whether
+    that happens per test or per module depends on the runner -- confirm.
+    """
+    logger.info("Inside teardown")
+    Blockade.blockade_destroy()
+
+
+def teardown_module():
+    """Call ClusterUtils.cluster_destroy on the compose file FILE."""
+    ClusterUtils.cluster_destroy(FILE)
+
+
+def test_two_dns_isolate_scm_same_partition():
+    """
+    In this test, one of the datanodes (first datanode) cannot communicate
+    with other two datanodes.
+    Two datanodes (second datanode and third datanode), on same network
+    parition, cannot communicate with SCM.
+    Expectation :
+    The container replica state in first datanode should be quasi-closed.
+    The container replica state in second datanode should be open.
+    The container replica state in third datanode should be open.
+    """
+    first_set = [OM[0], DATANODES[1], DATANODES[2]]
+    second_set = [OM[0], SCM[0], DATANODES[0]]
+    Blockade.blockade_create_partition(first_set, second_set)
+    Blockade.blockade_status()
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
+    first_datanode_status = all_datanodes_container_status[0]
+    second_datanode_status = all_datanodes_container_status[1]
+    third_datanode_status = all_datanodes_container_status[2]
+    assert first_datanode_status == 'QUASI_CLOSED'
+    assert second_datanode_status == 'OPEN'
+    assert third_datanode_status == 'OPEN'
+
+
+def test_two_dns_isolate_scm_different_partition():
+    """
+    In this test, one of the datanodes (first datanode) cannot communicate with
+     other two datanodes.
+    Two datanodes (first datanode and second datanode),
+    on different network paritions, cannot communicate with SCM.
+    Expectation :
+    The container replica state in first datanode should be open.
+    The container replica states can be either 'closed'
+    in both second and third datanode, or,
+    'open' in second datanode and 'quasi-closed' in third datanode.
+    """
+    first_set = [OM[0], DATANODES[0]]
+    second_set = [OM[0], DATANODES[1], DATANODES[2]]
+    third_set = [SCM[0], DATANODES[2]]
+    Blockade.blockade_create_partition(first_set, second_set, third_set)
+    Blockade.blockade_status()
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
+    first_datanode_status = all_datanodes_container_status[0]
+    second_datanode_status = all_datanodes_container_status[1]
+    third_datanode_status = all_datanodes_container_status[2]
+    assert first_datanode_status == 'OPEN'
+    assert (second_datanode_status == 'CLOSED' and
+            third_datanode_status == 'CLOSED') or \
+           (second_datanode_status == 'OPEN' and
+            third_datanode_status == 'QUASI_CLOSED')