Преглед изворни кода

AMBARI-12681. Namenode crashes with - No valid image files found (smohanty)

Sumit Mohanty пре 10 година
родитељ
комит
f5155c9139

+ 7 - 0
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py

@@ -365,6 +365,13 @@ def bootstrap_standby_namenode(params):
   try:
     iterations = 50
     bootstrap_cmd = "hdfs namenode -bootstrapStandby -nonInteractive"
+    # Blueprint-based deployments start both NN in parallel and occasionally
+    # the first attempt to bootstrap may fail. Depending on how it fails the
+    # second attempt may not succeed (e.g. it may find the folder and decide that
+    # bootstrap succeeded). The solution is to call with -force option but only
+    # during initial start
+    if params.command_phase == "INITIAL_START":
+      bootstrap_cmd = "hdfs namenode -bootstrapStandby -nonInteractive -force"
     Logger.info("Boostrapping standby namenode: %s" % (bootstrap_cmd))
     for i in range(iterations):
       Logger.info('Try %d out of %d' % (i+1, iterations))

+ 1 - 0
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py

@@ -133,6 +133,7 @@ falcon_user = config['configurations']['falcon-env']['falcon_user']
 hdfs_exclude_file = default("/clusterHostInfo/decom_dn_hosts", [])
 exclude_file_path = config['configurations']['hdfs-site']['dfs.hosts.exclude']
 update_exclude_file_only = default("/commandParams/update_exclude_file_only",False)
+command_phase = default("/commandParams/phase","")
 
 klist_path_local = get_klist_path(default('/configurations/kerberos-env/executable_search_paths', None))
 kinit_path_local = get_kinit_path(default('/configurations/kerberos-env/executable_search_paths', None))

+ 120 - 4
ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py

@@ -22,7 +22,7 @@ import json
 import os
 import tempfile
 from stacks.utils.RMFTestCase import *
-from mock.mock import MagicMock, patch
+from mock.mock import MagicMock, patch, call
 import resource_management
 from resource_management.core import shell
 from resource_management.core.exceptions import Fail
@@ -677,14 +677,16 @@ class TestNamenode(RMFTestCase):
   # tests namenode start command when NameNode HA is enabled, and
   # the HA cluster is started initially, rather than using the UI Wizard
   # this test verifies the startup of a "standby" namenode
-  @patch.object(shell, "call", new=MagicMock(return_value=(0,"")))
-  def test_start_ha_bootstrap_standby_from_blueprint(self):
+  @patch.object(shell, "call")
+  def test_start_ha_bootstrap_standby_from_blueprint(self, call_mocks):
+    call_mocks = MagicMock(return_value=(0,""))
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",
                        classname = "NameNode",
                        command = "start",
                        config_file="ha_bootstrap_standby_node.json",
                        hdp_stack_version = self.STACK_VERSION,
-                       target = RMFTestCase.TARGET_COMMON_SERVICES
+                       target = RMFTestCase.TARGET_COMMON_SERVICES,
+                       call_mocks = call_mocks
     )
     self.assert_configure_default()
 
@@ -770,6 +772,120 @@ class TestNamenode(RMFTestCase):
         hadoop_conf_dir = '/etc/hadoop/conf',
     )
     self.assertNoMoreResources()
+    self.assertTrue(call_mocks.called)
+    self.assertEqual(2, call_mocks.call_count)
+    calls = [
+      call('hdfs namenode -bootstrapStandby -nonInteractive', logoutput=False, user=u'hdfs'),
+      call("ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'", logoutput=True)]
+    call_mocks.assert_has_calls(calls, any_order=False)
+
+  # tests namenode start command when NameNode HA is enabled, and
+  # the HA cluster is started initially, rather than using the UI Wizard
+  # this test verifies the startup of a "standby" namenode
+  @patch.object(shell, "call")
+  def test_start_ha_bootstrap_standby_from_blueprint_initial_start(self, call_mocks):
+
+    call_mocks = MagicMock()
+    call_mocks.side_effect = [(1, None), (0, None), (0, None)]
+    self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",
+                       classname = "NameNode",
+                       command = "start",
+                       config_file="ha_bootstrap_standby_node_initial_start.json",
+                       hdp_stack_version = self.STACK_VERSION,
+                       target = RMFTestCase.TARGET_COMMON_SERVICES,
+                       call_mocks = call_mocks
+    )
+    self.assert_configure_default()
+
+    self.assertResourceCalled('File', '/etc/hadoop/conf/dfs.exclude',
+                              owner = 'hdfs',
+                              content = Template('exclude_hosts_list.j2'),
+                              group = 'hadoop',
+                              )
+    self.assertResourceCalled('Directory', '/var/run/hadoop',
+                              owner = 'hdfs',
+                              group = 'hadoop',
+                              mode = 0755
+    )
+
+    # TODO: Using shell.call() to bootstrap standby, which is mocked (side_effect) to fail on the
+    # first attempt (exit code 1) and succeed on the retry (exit code 0).
+    # Need to update the test case to verify that the standby case is detected, and that the bootstrap
+    # command is run before the namenode launches
+    self.assertResourceCalled('Directory', '/var/run/hadoop/hdfs',
+                              owner = 'hdfs',
+                              recursive = True,
+                              )
+    self.assertResourceCalled('Directory', '/var/log/hadoop/hdfs',
+                              owner = 'hdfs',
+                              recursive = True,
+                              )
+    self.assertResourceCalled('File', '/var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid',
+                              action = ['delete'],
+                              not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
+                              )
+    self.assertResourceCalled('Execute', "ambari-sudo.sh su hdfs -l -s /bin/bash -c '[RMF_EXPORT_PLACEHOLDER]ulimit -c unlimited ;  /usr/lib/hadoop/sbin/hadoop-daemon.sh --config /etc/hadoop/conf start namenode'",
+                              environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'},
+                              not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid",
+                              )
+    self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'",
+                              tries=180,
+                              try_sleep=10,
+                              user="hdfs",
+                              logoutput=True
+    )
+    self.assertResourceCalled('HdfsResource', '/tmp',
+                              security_enabled = False,
+                              only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'",
+                              keytab = UnknownConfigurationMock(),
+                              hadoop_bin_dir = '/usr/bin',
+                              default_fs = 'hdfs://ns1',
+                              hdfs_site = self.getConfig()['configurations']['hdfs-site'],
+                              kinit_path_local = '/usr/bin/kinit',
+                              principal_name = None,
+                              user = 'hdfs',
+                              owner = 'hdfs',
+                              hadoop_conf_dir = '/etc/hadoop/conf',
+                              type = 'directory',
+                              action = ['create_on_execute'],
+                              mode = 0777,
+                              )
+    self.assertResourceCalled('HdfsResource', '/user/ambari-qa',
+                              security_enabled = False,
+                              only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'",
+                              keytab = UnknownConfigurationMock(),
+                              hadoop_bin_dir = '/usr/bin',
+                              default_fs = 'hdfs://ns1',
+                              hdfs_site = self.getConfig()['configurations']['hdfs-site'],
+                              kinit_path_local = '/usr/bin/kinit',
+                              principal_name = None,
+                              user = 'hdfs',
+                              owner = 'ambari-qa',
+                              hadoop_conf_dir = '/etc/hadoop/conf',
+                              type = 'directory',
+                              action = ['create_on_execute'],
+                              mode = 0770,
+                              )
+    self.assertResourceCalled('HdfsResource', None,
+                              security_enabled = False,
+                              only_if = "ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'",
+                              keytab = UnknownConfigurationMock(),
+                              hadoop_bin_dir = '/usr/bin',
+                              default_fs = 'hdfs://ns1',
+                              hdfs_site = self.getConfig()['configurations']['hdfs-site'],
+                              kinit_path_local = '/usr/bin/kinit',
+                              principal_name = None,
+                              user = 'hdfs',
+                              action = ['execute'],
+                              hadoop_conf_dir = '/etc/hadoop/conf',
+                              )
+    self.assertNoMoreResources()
+    self.assertTrue(call_mocks.called)
+    self.assertEqual(3, call_mocks.call_count)
+    calls = [
+      call("ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export  PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'", logoutput=True),
+      call('hdfs namenode -bootstrapStandby -nonInteractive -force', logoutput=False, user=u'hdfs'),
+      call('hdfs namenode -bootstrapStandby -nonInteractive -force', logoutput=False, user=u'hdfs')]
+    call_mocks.assert_has_calls(calls, any_order=True)
 
   def test_decommission_default(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py",

Разлика између датотека није приказана због своје велике величине
+ 363 - 0
ambari-server/src/test/python/stacks/2.0.6/configs/ha_bootstrap_standby_node_initial_start.json


Неке датотеке нису приказане због велике количине промена