HostInfo.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. #!/usr/bin/env python
  2. '''
  3. Licensed to the Apache Software Foundation (ASF) under one
  4. or more contributor license agreements. See the NOTICE file
  5. distributed with this work for additional information
  6. regarding copyright ownership. The ASF licenses this file
  7. to you under the Apache License, Version 2.0 (the
  8. "License"); you may not use this file except in compliance
  9. with the License. You may obtain a copy of the License at
  10. http://www.apache.org/licenses/LICENSE-2.0
  11. Unless required by applicable law or agreed to in writing, software
  12. distributed under the License is distributed on an "AS IS" BASIS,
  13. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. See the License for the specific language governing permissions and
  15. limitations under the License.
  16. '''
  17. import glob
  18. import logging
  19. import os
  20. import re
  21. import shlex
  22. import socket
  23. import subprocess
  24. import time
  25. from ambari_commons import OSCheck, OSConst
  26. from ambari_commons.firewall import Firewall
  27. from ambari_commons.os_family_impl import OsFamilyImpl
  28. from ambari_agent.HostCheckReportFileHandler import HostCheckReportFileHandler
  29. logger = logging.getLogger()
  30. # service cmd
  31. SERVICE_CMD = "service"
  32. class HostInfo(object):
  33. # Filters used to identify processed
  34. PROC_FILTER = [
  35. "hadoop", "zookeeper"
  36. ]
  37. RESULT_UNAVAILABLE = "unable_to_determine"
  38. current_umask = -1
  39. def __init__(self, config=None):
  40. self.config = config
  41. self.reportFileHandler = HostCheckReportFileHandler(config)
  42. def dirType(self, path):
  43. if not os.path.exists(path):
  44. return 'not_exist'
  45. elif os.path.islink(path):
  46. return 'sym_link'
  47. elif os.path.isdir(path):
  48. return 'directory'
  49. elif os.path.isfile(path):
  50. return 'file'
  51. return 'unknown'
  52. def checkLiveServices(self, services, result):
  53. for service in services:
  54. svcCheckResult = {}
  55. svcCheckResult['name'] = " or ".join(service)
  56. svcCheckResult['status'] = "UNKNOWN"
  57. svcCheckResult['desc'] = ""
  58. try:
  59. out = ""
  60. err = ""
  61. for serviceName in service:
  62. sys_out, sys_err, code = self.getServiceStatus(serviceName)
  63. if code == 0:
  64. break
  65. out += sys_out if len(out) == 0 else os.linesep + sys_out
  66. err += sys_err if len(err) == 0 else os.linesep + sys_err
  67. if 0 != code:
  68. svcCheckResult['status'] = "Unhealthy"
  69. svcCheckResult['desc'] = out
  70. if len(out) == 0:
  71. svcCheckResult['desc'] = err
  72. else:
  73. svcCheckResult['status'] = "Healthy"
  74. except Exception, e:
  75. svcCheckResult['status'] = "Unhealthy"
  76. svcCheckResult['desc'] = repr(e)
  77. result.append(svcCheckResult)
  78. def getUMask(self):
  79. if (self.current_umask == -1):
  80. self.current_umask = os.umask(self.current_umask)
  81. os.umask(self.current_umask)
  82. return self.current_umask
  83. else:
  84. return self.current_umask
  85. def checkFirewall(self):
  86. return Firewall().getFirewallObject().check_firewall()
  87. def getFirewallName(self):
  88. return Firewall().getFirewallObject().get_firewall_name()
  89. def checkReverseLookup(self):
  90. """
  91. Check if host fqdn resolves to current host ip
  92. """
  93. try:
  94. host_name = socket.gethostname().lower()
  95. host_ip = socket.gethostbyname(host_name)
  96. host_fqdn = socket.getfqdn().lower()
  97. fqdn_ip = socket.gethostbyname(host_fqdn)
  98. return host_ip == fqdn_ip
  99. except socket.error:
  100. pass
  101. return False
  102. def get_ntp_service():
  103. if OSCheck.is_redhat_family():
  104. return ("ntpd", "chronyd",)
  105. elif OSCheck.is_suse_family():
  106. return ("ntpd", "ntp",)
  107. elif OSCheck.is_ubuntu_family():
  108. return ("ntp", "chrony",)
  109. @OsFamilyImpl(os_family=OsFamilyImpl.DEFAULT)
  110. class HostInfoLinux(HostInfo):
  111. # List of project names to be used to find alternatives folders etc.
  112. DEFAULT_PROJECT_NAMES = [
  113. "hadoop*", "hadoop", "hbase", "hcatalog", "hive",
  114. "oozie", "sqoop", "hue", "zookeeper", "mapred", "hdfs", "flume",
  115. "storm", "hive-hcatalog", "tez", "falcon", "ambari_qa", "hadoop_deploy",
  116. "rrdcached", "hcat", "ambari-qa", "sqoop-ambari-qa", "sqoop-ambari_qa",
  117. "webhcat", "hadoop-hdfs", "hadoop-yarn", "hadoop-mapreduce",
  118. "knox", "yarn", "hive-webhcat", "kafka", "slider", "storm-slider-client",
  119. "mahout", "spark", "pig", "phoenix", "ranger", "accumulo",
  120. "ambari-metrics-collector", "ambari-metrics-monitor", "atlas", "zeppelin"
  121. ]
  122. # List of live services checked for on the host, takes a map of plan strings
  123. DEFAULT_LIVE_SERVICES = [
  124. get_ntp_service()
  125. ]
  126. # Set of default users (need to be replaced with the configured user names)
  127. DEFAULT_USERS = [
  128. "hive", "ambari-qa", "oozie", "hbase", "hcat", "mapred",
  129. "hdfs", "zookeeper", "flume", "sqoop", "sqoop2",
  130. "hue", "yarn", "tez", "storm", "falcon", "kafka", "knox", "ams",
  131. "hadoop", "spark", "accumulo", "atlas", "mahout", "ranger", "kms", "zeppelin"
  132. ]
  133. # Default set of directories that are checked for existence of files and folders
  134. DEFAULT_BASEDIRS = [
  135. "/etc", "/var/run", "/var/log", "/usr/lib", "/var/lib", "/var/tmp", "/tmp", "/var",
  136. "/hadoop", "/usr/hdp"
  137. ]
  138. # Exact directories names which are checked for existance
  139. EXACT_DIRECTORIES = [
  140. "/kafka-logs"
  141. ]
  142. DEFAULT_SERVICE_NAME = "ntpd"
  143. SERVICE_STATUS_CMD = "%s %s status" % (SERVICE_CMD, DEFAULT_SERVICE_NAME)
  144. THP_FILE_REDHAT = "/sys/kernel/mm/redhat_transparent_hugepage/enabled"
  145. THP_FILE_UBUNTU = "/sys/kernel/mm/transparent_hugepage/enabled"
  146. def __init__(self, config=None):
  147. super(HostInfoLinux, self).__init__(config)
  148. def checkUsers(self, users, results):
  149. f = open('/etc/passwd', 'r')
  150. for userLine in f:
  151. fields = userLine.split(":")
  152. if fields[0] in users:
  153. result = {}
  154. result['name'] = fields[0]
  155. result['homeDir'] = fields[5]
  156. result['status'] = "Available"
  157. results.append(result)
  158. def checkFolders(self, basePaths, projectNames, exactDirectories, existingUsers, dirs):
  159. foldersToIgnore = []
  160. for user in existingUsers:
  161. foldersToIgnore.append(user['homeDir'])
  162. try:
  163. for dirName in basePaths:
  164. for project in projectNames:
  165. path = os.path.join(dirName.strip(), project.strip())
  166. if not path in foldersToIgnore and os.path.exists(path):
  167. obj = {}
  168. obj['type'] = self.dirType(path)
  169. obj['name'] = path
  170. dirs.append(obj)
  171. for path in exactDirectories:
  172. if os.path.exists(path):
  173. obj = {}
  174. obj['type'] = self.dirType(path)
  175. obj['name'] = path
  176. dirs.append(obj)
  177. except:
  178. logger.exception("Checking folders failed")
  179. def javaProcs(self, list):
  180. import pwd
  181. try:
  182. pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]
  183. for pid in pids:
  184. try:
  185. fp = open(os.path.join('/proc', pid, 'cmdline'), 'rb')
  186. except IOError:
  187. continue # avoid race condition if this process already died, since the moment we got pids list.
  188. cmd = fp.read()
  189. fp.close()
  190. cmd = cmd.replace('\0', ' ')
  191. if not 'AmbariServer' in cmd:
  192. if 'java' in cmd:
  193. dict = {}
  194. dict['pid'] = int(pid)
  195. dict['hadoop'] = False
  196. for filter in self.PROC_FILTER:
  197. if filter in cmd:
  198. dict['hadoop'] = True
  199. dict['command'] = unicode(cmd.strip(), errors='ignore')
  200. for line in open(os.path.join('/proc', pid, 'status')):
  201. if line.startswith('Uid:'):
  202. uid = int(line.split()[1])
  203. dict['user'] = pwd.getpwuid(uid).pw_name
  204. list.append(dict)
  205. except:
  206. logger.exception("Checking java processes failed")
  207. pass
  208. def getTransparentHugePage(self):
  209. thp_regex = "\[(.+)\]"
  210. file_name = None
  211. if OSCheck.is_ubuntu_family():
  212. file_name = self.THP_FILE_UBUNTU
  213. elif OSCheck.is_redhat_family():
  214. file_name = self.THP_FILE_REDHAT
  215. if file_name and os.path.isfile(file_name):
  216. with open(file_name) as f:
  217. file_content = f.read()
  218. return re.search(thp_regex, file_content).groups()[0]
  219. else:
  220. return ""
  221. def hadoopVarRunCount(self):
  222. if not os.path.exists('/var/run/hadoop'):
  223. return 0
  224. pids = glob.glob('/var/run/hadoop/*/*.pid')
  225. return len(pids)
  226. def hadoopVarLogCount(self):
  227. if not os.path.exists('/var/log/hadoop'):
  228. return 0
  229. logs = glob.glob('/var/log/hadoop/*/*.log')
  230. return len(logs)
  231. def etcAlternativesConf(self, projects, etcResults):
  232. if not os.path.exists('/etc/alternatives'):
  233. return []
  234. projectRegex = "'" + '|'.join(projects) + "'"
  235. files = [f for f in os.listdir('/etc/alternatives') if re.match(projectRegex, f)]
  236. for conf in files:
  237. result = {}
  238. filePath = os.path.join('/etc/alternatives', conf)
  239. if os.path.islink(filePath):
  240. realConf = os.path.realpath(filePath)
  241. result['name'] = conf
  242. result['target'] = realConf
  243. etcResults.append(result)
  244. def register(self, dict, componentsMapped=True, commandsInProgress=True):
  245. """ Return various details about the host
  246. componentsMapped: indicates if any components are mapped to this host
  247. commandsInProgress: indicates if any commands are in progress
  248. """
  249. dict['hostHealth'] = {}
  250. java = []
  251. self.javaProcs(java)
  252. dict['hostHealth']['activeJavaProcs'] = java
  253. liveSvcs = []
  254. self.checkLiveServices(self.DEFAULT_LIVE_SERVICES, liveSvcs)
  255. dict['hostHealth']['liveServices'] = liveSvcs
  256. dict['umask'] = str(self.getUMask())
  257. dict['transparentHugePage'] = self.getTransparentHugePage()
  258. dict['firewallRunning'] = self.checkFirewall()
  259. dict['firewallName'] = self.getFirewallName()
  260. dict['reverseLookup'] = self.checkReverseLookup()
  261. # If commands are in progress or components are already mapped to this host
  262. # Then do not perform certain expensive host checks
  263. if componentsMapped or commandsInProgress:
  264. dict['alternatives'] = []
  265. dict['stackFoldersAndFiles'] = []
  266. dict['existingUsers'] = []
  267. else:
  268. etcs = []
  269. self.etcAlternativesConf(self.DEFAULT_PROJECT_NAMES, etcs)
  270. dict['alternatives'] = etcs
  271. existingUsers = []
  272. self.checkUsers(self.DEFAULT_USERS, existingUsers)
  273. dict['existingUsers'] = existingUsers
  274. dirs = []
  275. self.checkFolders(self.DEFAULT_BASEDIRS, self.DEFAULT_PROJECT_NAMES, self.EXACT_DIRECTORIES, existingUsers, dirs)
  276. dict['stackFoldersAndFiles'] = dirs
  277. self.reportFileHandler.writeHostCheckFile(dict)
  278. pass
  279. # The time stamp must be recorded at the end
  280. dict['hostHealth']['agentTimeStampAtReporting'] = int(time.time() * 1000)
  281. pass
  282. def getServiceStatus(self, serivce_name):
  283. service_check_live = shlex.split(self.SERVICE_STATUS_CMD)
  284. service_check_live[1] = serivce_name
  285. osStat = subprocess.Popen(service_check_live, stdout=subprocess.PIPE,
  286. stderr=subprocess.PIPE)
  287. out, err = osStat.communicate()
  288. return out, err, osStat.returncode
  289. @OsFamilyImpl(os_family=OSConst.WINSRV_FAMILY)
  290. class HostInfoWindows(HostInfo):
  291. SERVICE_STATUS_CMD = 'If ((Get-Service | Where-Object {{$_.Name -eq \'{0}\'}}).Status -eq \'Running\') {{echo "Running"; $host.SetShouldExit(0)}} Else {{echo "Stopped"; $host.SetShouldExit(1)}}'
  292. GET_USERS_CMD = '$accounts=(Get-WmiObject -Class Win32_UserAccount -Namespace "root\cimv2" -Filter "name = \'{0}\' and Disabled=\'False\'" -ErrorAction Stop); foreach ($acc in $accounts) {{Write-Host ($acc.Domain + "\\" + $acc.Name)}}'
  293. GET_JAVA_PROC_CMD = 'foreach ($process in (gwmi Win32_Process -Filter "name = \'java.exe\'")){{echo $process.ProcessId;echo $process.CommandLine; echo $process.GetOwner().User}}'
  294. DEFAULT_LIVE_SERVICES = [
  295. ("W32Time",)
  296. ]
  297. DEFAULT_USERS = "hadoop"
  298. def checkUsers(self, user_mask, results):
  299. get_users_cmd = ["powershell", '-noProfile', '-NonInteractive', '-nologo', "-Command", self.GET_USERS_CMD.format(user_mask)]
  300. try:
  301. osStat = subprocess.Popen(get_users_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  302. out, err = osStat.communicate()
  303. except:
  304. raise Exception("Failed to get users.")
  305. for user in out.split(os.linesep):
  306. result = {}
  307. result['name'] = user
  308. result['homeDir'] = ""
  309. result['status'] = "Available"
  310. results.append(result)
  311. def createAlerts(self, alerts):
  312. # TODO AMBARI-7849 Implement createAlerts for Windows
  313. return alerts
  314. def javaProcs(self, list):
  315. try:
  316. from ambari_commons.os_windows import run_powershell_script
  317. code, out, err = run_powershell_script(self.GET_JAVA_PROC_CMD)
  318. if code == 0:
  319. splitted_output = out.split(os.linesep)
  320. for i in [index for index in range(0, len(splitted_output)) if (index % 3) == 0]:
  321. pid = splitted_output[i]
  322. cmd = splitted_output[i + 1]
  323. user = splitted_output[i + 2]
  324. if not 'AmbariServer' in cmd:
  325. if 'java' in cmd:
  326. dict = {}
  327. dict['pid'] = int(pid)
  328. dict['hadoop'] = False
  329. for filter in self.PROC_FILTER:
  330. if filter in cmd:
  331. dict['hadoop'] = True
  332. dict['command'] = cmd.strip()
  333. dict['user'] = user
  334. list.append(dict)
  335. except Exception as e:
  336. pass
  337. pass
  338. def getServiceStatus(self, serivce_name):
  339. from ambari_commons.os_windows import run_powershell_script
  340. code, out, err = run_powershell_script(self.SERVICE_STATUS_CMD.format(serivce_name))
  341. return out, err, code
  342. def register(self, dict, componentsMapped=True, commandsInProgress=True):
  343. """ Return various details about the host
  344. componentsMapped: indicates if any components are mapped to this host
  345. commandsInProgress: indicates if any commands are in progress
  346. """
  347. dict['hostHealth'] = {}
  348. java = []
  349. self.javaProcs(java)
  350. dict['hostHealth']['activeJavaProcs'] = java
  351. liveSvcs = []
  352. self.checkLiveServices(self.DEFAULT_LIVE_SERVICES, liveSvcs)
  353. dict['hostHealth']['liveServices'] = liveSvcs
  354. dict['umask'] = str(self.getUMask())
  355. dict['firewallRunning'] = self.checkFirewall()
  356. dict['firewallName'] = self.getFirewallName()
  357. dict['reverseLookup'] = self.checkReverseLookup()
  358. # If commands are in progress or components are already mapped to this host
  359. # Then do not perform certain expensive host checks
  360. if componentsMapped or commandsInProgress:
  361. dict['alternatives'] = []
  362. dict['stackFoldersAndFiles'] = []
  363. dict['existingUsers'] = []
  364. else:
  365. existingUsers = []
  366. self.checkUsers(self.DEFAULT_USERS, existingUsers)
  367. dict['existingUsers'] = existingUsers
  368. # TODO check HDP stack and folders here
  369. self.reportFileHandler.writeHostCheckFile(dict)
  370. pass
  371. # The time stamp must be recorded at the end
  372. dict['hostHealth']['agentTimeStampAtReporting'] = int(time.time() * 1000)
  373. def main(argv=None):
  374. h = HostInfo()
  375. struct = {}
  376. h.register(struct)
  377. print struct
  378. if __name__ == '__main__':
  379. main()