HostInfo.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. #!/usr/bin/env python
  2. '''
  3. Licensed to the Apache Software Foundation (ASF) under one
  4. or more contributor license agreements. See the NOTICE file
  5. distributed with this work for additional information
  6. regarding copyright ownership. The ASF licenses this file
  7. to you under the Apache License, Version 2.0 (the
  8. "License"); you may not use this file except in compliance
  9. with the License. You may obtain a copy of the License at
  10. http://www.apache.org/licenses/LICENSE-2.0
  11. Unless required by applicable law or agreed to in writing, software
  12. distributed under the License is distributed on an "AS IS" BASIS,
  13. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. See the License for the specific language governing permissions and
  15. limitations under the License.
  16. '''
  17. import glob
  18. import hostname
  19. import logging
  20. import os
  21. import re
  22. import shlex
  23. import socket
  24. import subprocess
  25. import time
  26. from ambari_commons import OSCheck, OSConst
  27. from ambari_commons.firewall import Firewall
  28. from ambari_commons.os_family_impl import OsFamilyImpl
  29. from resource_management.libraries.functions import packages_analyzer
  30. from ambari_agent.Hardware import Hardware
  31. from ambari_agent.HostCheckReportFileHandler import HostCheckReportFileHandler
  # Root logger (getLogger() is called without a name)
  logger = logging.getLogger()

  # Executable used to build HostInfoLinux.SERVICE_STATUS_CMD
  SERVICE_CMD = "service"
  class HostInfo(object):
      """Host checks shared by the OS-specific implementations in this file.

      ``checkLiveServices`` calls ``self.getServiceStatus(name)``, which this
      class does not define; HostInfoLinux and HostInfoWindows provide it.
      """

      # Substrings used to flag a process command line as Hadoop-related
      PROC_FILTER = [
          "hadoop", "zookeeper"
      ]

      RESULT_UNAVAILABLE = "unable_to_determine"

      # Cached umask of the process; -1 means it has not been read yet
      current_umask = -1

      def __init__(self, config=None):
          """Store ``config`` and create the host-check report file handler."""
          self.config = config
          self.reportFileHandler = HostCheckReportFileHandler(config)

      def dirType(self, path):
          """Classify ``path``.

          Returns one of 'not_exist', 'sym_link', 'directory', 'file' or
          'unknown'. The existence test runs first, so a path for which
          os.path.exists() is False is always reported as 'not_exist'.
          """
          if not os.path.exists(path):
              return 'not_exist'
          elif os.path.islink(path):
              return 'sym_link'
          elif os.path.isdir(path):
              return 'directory'
          elif os.path.isfile(path):
              return 'file'
          return 'unknown'

      def checkLiveServices(self, services, result):
          """Append one status record per entry of ``services`` to ``result``.

          Each record is a dict with the keys 'name', 'status' and 'desc'.
          'status' is "Healthy" when getServiceStatus() reports exit code 0 and
          "Unhealthy" otherwise, including when getServiceStatus() raises.
          """
          for serviceName in services:
              svcCheckResult = {
                  'name': serviceName,
                  'status': "UNKNOWN",
                  'desc': "",
              }
              try:
                  out, err, code = self.getServiceStatus(serviceName)
                  if 0 != code:
                      svcCheckResult['status'] = "Unhealthy"
                      # Prefer stdout; fall back to stderr when stdout is empty
                      svcCheckResult['desc'] = out if len(out) != 0 else err
                  else:
                      svcCheckResult['status'] = "Healthy"
              except Exception as e:
                  # A failing status command is reported, never propagated
                  svcCheckResult['status'] = "Unhealthy"
                  svcCheckResult['desc'] = repr(e)
              result.append(svcCheckResult)

      def getUMask(self):
          """Return the umask of the current process, reading it at most once."""
          if self.current_umask == -1:
              # os.umask() returns the previous mask only as a by-product of
              # setting a new one, so set a temporary value and restore the
              # previous mask immediately.
              self.current_umask = os.umask(self.current_umask)
              os.umask(self.current_umask)
          return self.current_umask

      def checkFirewall(self):
          """Return the result of check_firewall() on the firewall object."""
          return Firewall().getFirewallObject().check_firewall()

      def getFirewallName(self):
          """Return the result of get_firewall_name() on the firewall object."""
          return Firewall().getFirewallObject().get_firewall_name()

      def checkReverseLookup(self):
          """
          Check if host fqdn resolves to current host ip

          Returns True when the address of socket.gethostname() equals the
          address of socket.getfqdn(); returns False when they differ or when
          a lookup raises socket.error.
          """
          try:
              host_name = socket.gethostname().lower()
              host_ip = socket.gethostbyname(host_name)
              host_fqdn = socket.getfqdn().lower()
              fqdn_ip = socket.gethostbyname(host_fqdn)
              return host_ip == fqdn_ip
          except socket.error:
              pass
          return False
  def get_ntp_service():
      """Return the NTP service name to check for the detected OS family.

      "ntpd" for the redhat family, "ntp" for the suse and ubuntu families.
      Any other family falls back to "ntpd" so that callers never receive
      None as a service name.
      """
      if OSCheck.is_redhat_family():
          return "ntpd"
      elif OSCheck.is_suse_family() or OSCheck.is_ubuntu_family():
          return "ntp"
      # Unrecognised OS family: use an explicit default instead of the
      # implicit None the function used to return here.
      return "ntpd"
  @OsFamilyImpl(os_family=OsFamilyImpl.DEFAULT)
  class HostInfoLinux(HostInfo):
      """Linux host checks, registered as the OsFamilyImpl.DEFAULT implementation."""

      # List of project names to be used to find alternatives folders etc.
      DEFAULT_PROJECT_NAMES = [
          "hadoop*", "hadoop", "hbase", "hcatalog", "hive", "ganglia",
          "oozie", "sqoop", "hue", "zookeeper", "mapred", "hdfs", "flume",
          "storm", "hive-hcatalog", "tez", "falcon", "ambari_qa", "hadoop_deploy",
          "rrdcached", "hcat", "ambari-qa", "sqoop-ambari-qa", "sqoop-ambari_qa",
          "webhcat", "hadoop-hdfs", "hadoop-yarn", "hadoop-mapreduce",
          "knox", "yarn", "hive-webhcat", "kafka", "slider", "storm-slider-client",
          "ganglia-web"
      ]

      # Names of the services whose status is checked on the host
      DEFAULT_LIVE_SERVICES = [
          get_ntp_service()
      ]

      # Set of default users (need to be replaced with the configured user names)
      DEFAULT_USERS = [
          "hive", "ambari-qa", "oozie", "hbase", "hcat", "mapred",
          "hdfs", "rrdcached", "zookeeper", "flume", "sqoop", "sqoop2",
          "hue", "yarn", "tez", "storm", "falcon", "kafka", "knox"
      ]

      # Default set of directories that are checked for existence of files and folders
      DEFAULT_DIRS = [
          "/etc", "/var/run", "/var/log", "/usr/lib", "/var/lib", "/var/tmp", "/tmp", "/var",
          "/hadoop", "/usr/hdp"
      ]

      DEFAULT_SERVICE_NAME = "ntpd"

      # Template command line; getServiceStatus() swaps in the real service name
      SERVICE_STATUS_CMD = "%s %s status" % (SERVICE_CMD, DEFAULT_SERVICE_NAME)

      THP_FILE = "/sys/kernel/mm/redhat_transparent_hugepage/enabled"

      def __init__(self, config=None):
          super(HostInfoLinux, self).__init__(config)

      def osdiskAvailableSpace(self, path):
          """Return mount information for ``path`` parsed from ``df -kPT``.

          The last line of the df output is handed to
          Hardware.extractMountInfo(). Best effort: an empty dict is returned
          when df cannot be run or its output cannot be parsed.
          """
          diskInfo = {}
          try:
              df = subprocess.Popen(["df", "-kPT", path], stdout=subprocess.PIPE)
              dfdata = df.communicate()[0]
              return Hardware.extractMountInfo(dfdata.splitlines()[-1])
          except Exception:
              logger.debug("Unable to read disk info for %s", path, exc_info=True)
          return diskInfo

      def checkUsers(self, users, results):
          """Append a record to ``results`` for each /etc/passwd user in ``users``.

          Each record has the keys 'name', 'homeDir' and 'status'
          ('status' is always "Available").
          """
          with open('/etc/passwd', 'r') as passwd:
              for userLine in passwd:
                  fields = userLine.split(":")
                  # Skip malformed entries that have no home-directory field
                  if len(fields) <= 5 or fields[0] not in users:
                      continue
                  results.append({
                      'name': fields[0],
                      'homeDir': fields[5],
                      'status': "Available",
                  })

      def checkFolders(self, basePaths, projectNames, existingUsers, dirs):
          """Append to ``dirs`` every existing ``<basePath>/<projectName>`` path.

          Home directories of ``existingUsers`` (their 'homeDir' value) are
          skipped. Each appended record has the keys 'type' (see dirType())
          and 'name'. Errors while probing the file system are swallowed.
          """
          foldersToIgnore = [user['homeDir'] for user in existingUsers]
          try:
              for dirName in basePaths:
                  for project in projectNames:
                      path = os.path.join(dirName.strip(), project.strip())
                      if path not in foldersToIgnore and os.path.exists(path):
                          dirs.append({
                              'type': self.dirType(path),
                              'name': path,
                          })
          except Exception:
              logger.debug("Unable to complete the folder check", exc_info=True)

      def javaProcs(self, list):
          """Append a record to ``list`` for each java process found under /proc.

          Processes whose command line contains 'AmbariServer' are ignored.
          Each record has the keys 'pid', 'hadoop' (True when the command line
          contains one of PROC_FILTER), 'command' and, when the Uid line of the
          status file can be resolved, 'user'. Never raises.
          """
          import pwd

          try:
              pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]
          except Exception:
              return

          for pid in pids:
              # Handle each process separately: one that exits between the
              # directory listing and the reads must not end the whole scan.
              try:
                  with open(os.path.join('/proc', pid, 'cmdline'), 'rb') as cmdFile:
                      cmd = cmdFile.read()
                  cmd = cmd.replace('\0', ' ')
                  if 'AmbariServer' in cmd or 'java' not in cmd:
                      continue
                  procInfo = {}
                  procInfo['pid'] = int(pid)
                  procInfo['hadoop'] = any(token in cmd for token in self.PROC_FILTER)
                  procInfo['command'] = cmd.strip()
                  with open(os.path.join('/proc', pid, 'status')) as statusFile:
                      for line in statusFile:
                          if line.startswith('Uid:'):
                              uid = int(line.split()[1])
                              procInfo['user'] = pwd.getpwuid(uid).pw_name
                  list.append(procInfo)
              except Exception:
                  continue

      def getTransparentHugePage(self):
          """Return the bracketed value found in THP_FILE, or "" if unavailable.

          "" is returned when THP_FILE is not a regular file or when its
          content holds no ``[...]`` token.
          """
          # This file exist only on redhat 6
          thp_regex = r"\[(.+)\]"
          if not os.path.isfile(self.THP_FILE):
              return ""
          with open(self.THP_FILE) as f:
              file_content = f.read()
          match = re.search(thp_regex, file_content)
          return match.group(1) if match else ""

      def hadoopVarRunCount(self):
          """Return the number of files matching /var/run/hadoop/*/*.pid."""
          if not os.path.exists('/var/run/hadoop'):
              return 0
          pids = glob.glob('/var/run/hadoop/*/*.pid')
          return len(pids)

      def hadoopVarLogCount(self):
          """Return the number of files matching /var/log/hadoop/*/*.log."""
          if not os.path.exists('/var/log/hadoop'):
              return 0
          logs = glob.glob('/var/log/hadoop/*/*.log')
          return len(logs)

      def etcAlternativesConf(self, projects, etcResults):
          """Append the /etc/alternatives symlinks of ``projects`` to ``etcResults``.

          The entries of ``projects`` are joined with '|' and used as a regular
          expression matched against the start of each file name. Each
          appended record has the keys 'name' and 'target' (the resolved
          link). Returns [] when /etc/alternatives does not exist; results
          are otherwise delivered only through ``etcResults``.
          """
          if not os.path.exists('/etc/alternatives'):
              return []
          if not projects:
              # An empty alternation would match every file name
              return

          # No quote characters around the alternation: they would become part
          # of the first and last alternatives and stop those from matching.
          projectRegex = '|'.join(projects)
          files = [f for f in os.listdir('/etc/alternatives') if re.match(projectRegex, f)]
          for conf in files:
              filePath = os.path.join('/etc/alternatives', conf)
              if os.path.islink(filePath):
                  etcResults.append({
                      'name': conf,
                      'target': os.path.realpath(filePath),
                  })

      def register(self, dict, componentsMapped=True, commandsInProgress=True):
          """ Return various details about the host
          componentsMapped: indicates if any components are mapped to this host
          commandsInProgress: indicates if any commands are in progress

          The details are written into ``dict`` (nothing is returned). The
          host-check report file is written only when the expensive checks
          run, i.e. when both flags are false.
          """
          dict['hostHealth'] = {}

          java = []
          self.javaProcs(java)
          dict['hostHealth']['activeJavaProcs'] = java

          liveSvcs = []
          self.checkLiveServices(self.DEFAULT_LIVE_SERVICES, liveSvcs)
          dict['hostHealth']['liveServices'] = liveSvcs

          dict['umask'] = str(self.getUMask())

          dict['transparentHugePage'] = self.getTransparentHugePage()
          dict['firewallRunning'] = self.checkFirewall()
          dict['firewallName'] = self.getFirewallName()
          dict['reverseLookup'] = self.checkReverseLookup()

          # If commands are in progress or components are already mapped to this host
          # Then do not perform certain expensive host checks
          if componentsMapped or commandsInProgress:
              dict['alternatives'] = []
              dict['stackFoldersAndFiles'] = []
              dict['existingUsers'] = []
          else:
              etcs = []
              self.etcAlternativesConf(self.DEFAULT_PROJECT_NAMES, etcs)
              dict['alternatives'] = etcs

              existingUsers = []
              self.checkUsers(self.DEFAULT_USERS, existingUsers)
              dict['existingUsers'] = existingUsers

              dirs = []
              self.checkFolders(self.DEFAULT_DIRS, self.DEFAULT_PROJECT_NAMES, existingUsers, dirs)
              dict['stackFoldersAndFiles'] = dirs

              self.reportFileHandler.writeHostCheckFile(dict)

          # The time stamp must be recorded at the end
          dict['hostHealth']['agentTimeStampAtReporting'] = int(time.time() * 1000)

      def getServiceStatus(self, serivce_name):
          """Run ``service <serivce_name> status``; return (stdout, stderr, exit code)."""
          service_check_live = shlex.split(self.SERVICE_STATUS_CMD)
          # Slot 1 of the template holds DEFAULT_SERVICE_NAME; replace it
          service_check_live[1] = serivce_name
          osStat = subprocess.Popen(service_check_live, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
          out, err = osStat.communicate()
          return out, err, osStat.returncode
  @OsFamilyImpl(os_family=OSConst.WINSRV_FAMILY)
  class HostInfoWindows(HostInfo):
      """Host checks registered for OSConst.WINSRV_FAMILY; they run through PowerShell."""

      # str.format() template: {0} is the service name, doubled braces are literal
      SERVICE_STATUS_CMD = 'If ((Get-Service | Where-Object {{$_.Name -eq \'{0}\'}}).Status -eq \'Running\') {{echo "Running"; $host.SetShouldExit(0)}} Else {{echo "Stopped"; $host.SetShouldExit(1)}}'
      GET_USERS_CMD = '$accounts=(Get-WmiObject -Class Win32_UserAccount -Namespace "root\\cimv2" -Filter "LocalAccount=\'$True\'" -ComputerName "LocalHost" -ErrorAction Stop); foreach ($acc in $accounts) {echo $acc.Name}'
      # Echoes three values per java.exe process: process id, command line, owner
      GET_JAVA_PROC_CMD = 'foreach ($process in (gwmi Win32_Process -Filter "name = \'java.exe\'")){echo $process.ProcessId;echo $process.CommandLine; echo $process.GetOwner().User}'

      DEFAULT_LIVE_SERVICES = [
          "W32Time"
      ]

      DEFAULT_USERS = ["hadoop"]

      def checkUsers(self, users, results):
          """Append a record to ``results`` for each local account in ``users``.

          Each record has the keys 'name' and 'status' ("Available").
          Raises Exception when the PowerShell command cannot be run.
          """
          get_users_cmd = ["powershell", '-noProfile', '-NonInteractive', '-nologo', "-Command", self.GET_USERS_CMD]
          try:
              osStat = subprocess.Popen(get_users_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
              out, err = osStat.communicate()
          except Exception as e:
              # Same exception type as before, with the cause kept in the message
              raise Exception("Failed to get users. Cause: %r" % (e,))
          for user in out.split(os.linesep):
              if user in users:
                  results.append({
                      'name': user,
                      'status': "Available",
                  })

      def createAlerts(self, alerts):
          """Return ``alerts`` unchanged."""
          # TODO AMBARI-7849 Implement createAlerts for Windows
          return alerts

      def javaProcs(self, list):
          """Append a record to ``list`` for each java process reported by PowerShell.

          Processes whose command line contains 'AmbariServer' are ignored.
          Each record has the keys 'pid', 'hadoop' (True when the command line
          contains one of PROC_FILTER), 'command' and 'user'. Never raises.
          """
          try:
              from ambari_commons.os_windows import run_powershell_script
              code, out, err = run_powershell_script(self.GET_JAVA_PROC_CMD)
              if code == 0:
                  splitted_output = out.split(os.linesep)
                  # Output lines are read in groups of three (pid, command line,
                  # owner). Stop before a trailing incomplete group rather than
                  # relying on an IndexError to end the loop.
                  for i in range(0, len(splitted_output) - 2, 3):
                      pid, cmd, user = splitted_output[i:i + 3]
                      if 'AmbariServer' in cmd or 'java' not in cmd:
                          continue
                      procInfo = {}
                      procInfo['pid'] = int(pid)
                      procInfo['hadoop'] = any(token in cmd for token in self.PROC_FILTER)
                      procInfo['command'] = cmd.strip()
                      procInfo['user'] = user
                      list.append(procInfo)
          except Exception:
              # Best effort: records collected before the failure are kept
              pass

      def getServiceStatus(self, serivce_name):
          """Run SERVICE_STATUS_CMD for ``serivce_name``; return (out, err, code)."""
          from ambari_commons.os_windows import run_powershell_script
          code, out, err = run_powershell_script(self.SERVICE_STATUS_CMD.format(serivce_name))
          return out, err, code

      def register(self, dict, componentsMapped=True, commandsInProgress=True):
          """ Return various details about the host
          componentsMapped: indicates if any components are mapped to this host
          commandsInProgress: indicates if any commands are in progress

          The details are written into ``dict`` (nothing is returned). The
          host-check report file is written only when the expensive checks
          run, i.e. when both flags are false.
          """
          dict['hostHealth'] = {}

          java = []
          self.javaProcs(java)
          dict['hostHealth']['activeJavaProcs'] = java

          liveSvcs = []
          self.checkLiveServices(self.DEFAULT_LIVE_SERVICES, liveSvcs)
          dict['hostHealth']['liveServices'] = liveSvcs

          dict['umask'] = str(self.getUMask())

          dict['firewallRunning'] = self.checkFirewall()
          dict['firewallName'] = self.getFirewallName()
          dict['reverseLookup'] = self.checkReverseLookup()

          # If commands are in progress or components are already mapped to this host
          # Then do not perform certain expensive host checks
          if componentsMapped or commandsInProgress:
              dict['alternatives'] = []
              dict['stackFoldersAndFiles'] = []
              dict['existingUsers'] = []
          else:
              existingUsers = []
              self.checkUsers(self.DEFAULT_USERS, existingUsers)
              dict['existingUsers'] = existingUsers
              # TODO check HDP stack and folders here
              self.reportFileHandler.writeHostCheckFile(dict)

          # The time stamp must be recorded at the end
          dict['hostHealth']['agentTimeStampAtReporting'] = int(time.time() * 1000)
  def main(argv=None):
      """Collect the host details with HostInfo.register() and print them.

      ``argv`` is accepted but not used.
      """
      report = {}
      HostInfo().register(report)
      print(report)


  if __name__ == '__main__':
      main()