HostInfo.py
#!/usr/bin/env python
'''
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

import glob
import hostname
import logging
import os
import re
import shlex
import socket
import subprocess
import time
from ambari_commons import OSCheck, OSConst
from ambari_commons.firewall import Firewall
from ambari_commons.os_family_impl import OsFamilyImpl
from resource_management.libraries.functions import packages_analyzer
from ambari_agent.Hardware import Hardware
from ambari_agent.HostCheckReportFileHandler import HostCheckReportFileHandler

logger = logging.getLogger()

# service cmd
SERVICE_CMD = "service"


class HostInfo(object):
  # Filters used to identify processes
  PROC_FILTER = [
    "hadoop", "zookeeper"
  ]

  RESULT_UNAVAILABLE = "unable_to_determine"

  current_umask = -1

  def __init__(self, config=None):
    self.config = config
    self.reportFileHandler = HostCheckReportFileHandler(config)

  def dirType(self, path):
    if not os.path.exists(path):
      return 'not_exist'
    elif os.path.islink(path):
      return 'sym_link'
    elif os.path.isdir(path):
      return 'directory'
    elif os.path.isfile(path):
      return 'file'
    return 'unknown'

  def checkLiveServices(self, services, result):
    osType = OSCheck.get_os_family()
    for service in services:
      svcCheckResult = {}
      if isinstance(service, dict):
        serviceName = service[osType]
      else:
        serviceName = service
      svcCheckResult['name'] = serviceName
      svcCheckResult['status'] = "UNKNOWN"
      svcCheckResult['desc'] = ""
      try:
        out, err, code = self.getServiceStatus(serviceName)
        if 0 != code:
          svcCheckResult['status'] = "Unhealthy"
          svcCheckResult['desc'] = out
          if len(out) == 0:
            svcCheckResult['desc'] = err
        else:
          svcCheckResult['status'] = "Healthy"
      except Exception as e:
        svcCheckResult['status'] = "Unhealthy"
        svcCheckResult['desc'] = repr(e)
      result.append(svcCheckResult)
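
  # Illustrative note (a sketch of how the method above resolves its input, not
  # an additional API): an OS-family map entry such as
  #   {OSConst.REDHAT_FAMILY: "ntpd", OSConst.SUSE_FAMILY: "ntp"}
  # resolves to the service name for the current family, and each appended
  # result is shaped like {'name': 'ntpd', 'status': 'Healthy', 'desc': ''}.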

  def getUMask(self):
    if (self.current_umask == -1):
      self.current_umask = os.umask(self.current_umask)
      os.umask(self.current_umask)
      return self.current_umask
    else:
      return self.current_umask
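
  # The method above relies on the documented behavior of os.umask: setting the
  # mask returns the previous value, so setting and immediately restoring it
  # reads the current mask. A minimal sketch of the same trick:
  #   previous = os.umask(0o022)  # returns the old mask
  #   os.umask(previous)          # put it back unchanged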

  def checkReverseLookup(self):
    """
    Check if host fqdn resolves to current host ip
    """
    try:
      host_name = socket.gethostname().lower()
      host_ip = socket.gethostbyname(host_name)
      host_fqdn = socket.getfqdn().lower()
      fqdn_ip = socket.gethostbyname(host_fqdn)
      return host_ip == fqdn_ip
    except socket.error:
      pass
    return False


@OsFamilyImpl(os_family=OsFamilyImpl.DEFAULT)
class HostInfoLinux(HostInfo):
  # List of project names to be used to find alternatives folders etc.
  DEFAULT_PROJECT_NAMES = [
    "hadoop*", "hadoop", "hbase", "hcatalog", "hive", "ganglia",
    "oozie", "sqoop", "hue", "zookeeper", "mapred", "hdfs", "flume",
    "storm", "hive-hcatalog", "tez", "falcon", "ambari_qa", "hadoop_deploy",
    "rrdcached", "hcat", "ambari-qa", "sqoop-ambari-qa", "sqoop-ambari_qa",
    "webhcat", "hadoop-hdfs", "hadoop-yarn", "hadoop-mapreduce"
  ]

  # List of live services checked for on the host; each entry is either a plain
  # service name or a map from OS family to that family's service name
  DEFAULT_LIVE_SERVICES = [
    {OSConst.REDHAT_FAMILY: "ntpd", OSConst.SUSE_FAMILY: "ntp", OSConst.UBUNTU_FAMILY: "ntp"}
  ]

  # Set of default users (need to be replaced with the configured user names)
  DEFAULT_USERS = [
    "hive", "ambari-qa", "oozie", "hbase", "hcat", "mapred",
    "hdfs", "rrdcached", "zookeeper", "flume", "sqoop", "sqoop2",
    "hue", "yarn", "tez", "storm", "falcon", "kafka", "knox"
  ]

  # Default set of directories that are checked for existence of files and folders
  DEFAULT_DIRS = [
    "/etc", "/var/run", "/var/log", "/usr/lib", "/var/lib", "/var/tmp", "/tmp", "/var",
    "/hadoop", "/usr/hdp"
  ]

  # Packages that are used to find repos (then repos are used to find other packages)
  PACKAGES = [
    "hadoop_2_2_*", "hadoop-2-2-.*", "zookeeper_2_2_*", "zookeeper-2-2-.*",
    "hadoop", "zookeeper", "webhcat", "*-manager-server-db", "*-manager-daemons"
  ]

  # Additional packages to look for (search packages that start with these)
  ADDITIONAL_PACKAGES = [
    "rrdtool", "rrdtool-python", "ganglia", "gmond", "gweb", "libconfuse",
    "ambari-log4j", "hadoop", "zookeeper", "oozie", "webhcat"
  ]

  # Ignore packages from repos whose names start with these strings
  IGNORE_PACKAGES_FROM_REPOS = [
    "ambari", "installed"
  ]

  # Ignore required packages
  IGNORE_PACKAGES = [
    "epel-release"
  ]

  # Ignore repos from the list of repos to be cleaned
  IGNORE_REPOS = [
    "ambari", "HDP-UTILS"
  ]

  DEFAULT_SERVICE_NAME = "ntpd"
  SERVICE_STATUS_CMD = "%s %s status" % (SERVICE_CMD, DEFAULT_SERVICE_NAME)

  THP_FILE = "/sys/kernel/mm/redhat_transparent_hugepage/enabled"

  def __init__(self, config=None):
    super(HostInfoLinux, self).__init__(config)

  def osdiskAvailableSpace(self, path):
    diskInfo = {}
    try:
      df = subprocess.Popen(["df", "-kPT", path], stdout=subprocess.PIPE)
      dfdata = df.communicate()[0]
      return Hardware.extractMountInfo(dfdata.splitlines()[-1])
    except:
      pass
    return diskInfo

  def checkUsers(self, users, results):
    f = open('/etc/passwd', 'r')
    for userLine in f:
      fields = userLine.split(":")
      if fields[0] in users:
        result = {}
        homeDir = fields[5]
        result['name'] = fields[0]
        result['homeDir'] = fields[5]
        result['status'] = "Available"
        if not os.path.exists(homeDir):
          result['status'] = "Invalid home directory"
        results.append(result)
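
  # Worked example of the /etc/passwd parsing above (illustrative values only):
  # a line like "hdfs:x:508:504::/home/hdfs:/bin/bash" splits on ':' so that
  # fields[0] is the user name ("hdfs") and fields[5] is the home directory
  # ("/home/hdfs"), which is then checked for existence.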

  def checkFolders(self, basePaths, projectNames, existingUsers, dirs):
    foldersToIgnore = []
    for user in existingUsers:
      foldersToIgnore.append(user['homeDir'])
    try:
      for dirName in basePaths:
        for project in projectNames:
          path = os.path.join(dirName.strip(), project.strip())
          if not path in foldersToIgnore and os.path.exists(path):
            obj = {}
            obj['type'] = self.dirType(path)
            obj['name'] = path
            dirs.append(obj)
    except:
      pass

  def javaProcs(self, list):
    import pwd

    try:
      pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]
      for pid in pids:
        cmd = open(os.path.join('/proc', pid, 'cmdline'), 'rb').read()
        cmd = cmd.replace('\0', ' ')
        if not 'AmbariServer' in cmd:
          if 'java' in cmd:
            dict = {}
            dict['pid'] = int(pid)
            dict['hadoop'] = False
            for filter in self.PROC_FILTER:
              if filter in cmd:
                dict['hadoop'] = True
            dict['command'] = cmd.strip()
            for line in open(os.path.join('/proc', pid, 'status')):
              if line.startswith('Uid:'):
                uid = int(line.split()[1])
                dict['user'] = pwd.getpwuid(uid).pw_name
            list.append(dict)
    except:
      pass
    pass

  def getReposToRemove(self, repos, ignoreList):
    reposToRemove = []
    for repo in repos:
      addToRemoveList = True
      for ignoreRepo in ignoreList:
        if packages_analyzer.nameMatch(ignoreRepo, repo):
          addToRemoveList = False
          continue
      if addToRemoveList:
        reposToRemove.append(repo)
    return reposToRemove

  def getTransparentHugePage(self):
    # This file exists only on RedHat 6
    thp_regex = "\[(.+)\]"
    if os.path.isfile(self.THP_FILE):
      with open(self.THP_FILE) as f:
        file_content = f.read()
        return re.search(thp_regex, file_content).groups()[0]
    else:
      return ""
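
  # Illustration of the value parsed above (the file content is an assumption
  # about typical kernel output, not taken from this module): the THP file
  # usually reads like "always madvise [never]", and the regex extracts the
  # bracketed current setting, here "never".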

  def checkIptables(self):
    return Firewall().getFirewallObject().check_iptables()

  def hadoopVarRunCount(self):
    if not os.path.exists('/var/run/hadoop'):
      return 0
    pids = glob.glob('/var/run/hadoop/*/*.pid')
    return len(pids)

  def hadoopVarLogCount(self):
    if not os.path.exists('/var/log/hadoop'):
      return 0
    logs = glob.glob('/var/log/hadoop/*/*.log')
    return len(logs)

  def etcAlternativesConf(self, projects, etcResults):
    if not os.path.exists('/etc/alternatives'):
      return []
    projectRegex = "'" + '|'.join(projects) + "'"
    files = [f for f in os.listdir('/etc/alternatives') if re.match(projectRegex, f)]
    for conf in files:
      result = {}
      filePath = os.path.join('/etc/alternatives', conf)
      if os.path.islink(filePath):
        realConf = os.path.realpath(filePath)
        result['name'] = conf
        result['target'] = realConf
      etcResults.append(result)

  def register(self, dict, componentsMapped=True, commandsInProgress=True):
    """ Return various details about the host
    componentsMapped: indicates if any components are mapped to this host
    commandsInProgress: indicates if any commands are in progress
    """
    dict['hostHealth'] = {}

    java = []
    self.javaProcs(java)
    dict['hostHealth']['activeJavaProcs'] = java

    liveSvcs = []
    self.checkLiveServices(self.DEFAULT_LIVE_SERVICES, liveSvcs)
    dict['hostHealth']['liveServices'] = liveSvcs

    dict['umask'] = str(self.getUMask())
    dict['transparentHugePage'] = self.getTransparentHugePage()
    dict['iptablesIsRunning'] = self.checkIptables()
    dict['reverseLookup'] = self.checkReverseLookup()

    # If commands are in progress or components are already mapped to this host
    # then do not perform certain expensive host checks
    if componentsMapped or commandsInProgress:
      dict['existingRepos'] = [self.RESULT_UNAVAILABLE]
      dict['installedPackages'] = []
      dict['alternatives'] = []
      dict['stackFoldersAndFiles'] = []
      dict['existingUsers'] = []
    else:
      etcs = []
      self.etcAlternativesConf(self.DEFAULT_PROJECT_NAMES, etcs)
      dict['alternatives'] = etcs

      existingUsers = []
      self.checkUsers(self.DEFAULT_USERS, existingUsers)
      dict['existingUsers'] = existingUsers

      dirs = []
      self.checkFolders(self.DEFAULT_DIRS, self.DEFAULT_PROJECT_NAMES, existingUsers, dirs)
      dict['stackFoldersAndFiles'] = dirs

      installedPackages = []
      availablePackages = []
      packages_analyzer.allInstalledPackages(installedPackages)
      packages_analyzer.allAvailablePackages(availablePackages)

      repos = []
      packages_analyzer.getInstalledRepos(self.PACKAGES, installedPackages + availablePackages,
                                          self.IGNORE_PACKAGES_FROM_REPOS, repos)
      packagesInstalled = packages_analyzer.getInstalledPkgsByRepo(repos, self.IGNORE_PACKAGES, installedPackages)
      additionalPkgsInstalled = packages_analyzer.getInstalledPkgsByNames(
        self.ADDITIONAL_PACKAGES, installedPackages)
      allPackages = list(set(packagesInstalled + additionalPkgsInstalled))
      dict['installedPackages'] = packages_analyzer.getPackageDetails(installedPackages, allPackages)

      repos = self.getReposToRemove(repos, self.IGNORE_REPOS)
      dict['existingRepos'] = repos

      self.reportFileHandler.writeHostCheckFile(dict)

    # The time stamp must be recorded at the end
    dict['hostHealth']['agentTimeStampAtReporting'] = int(time.time() * 1000)

  def getServiceStatus(self, service_name):
    service_check_live = shlex.split(self.SERVICE_STATUS_CMD)
    service_check_live[1] = service_name
    osStat = subprocess.Popen(service_check_live, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    out, err = osStat.communicate()
    return out, err, osStat.returncode
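
  # Worked example (a sketch grounded in the constants above, not an extra
  # API): SERVICE_STATUS_CMD expands to "service ntpd status", so a call like
  # getServiceStatus("crond") runs ["service", "crond", "status"] and returns
  # the command's stdout, stderr, and exit code.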


@OsFamilyImpl(os_family=OSConst.WINSRV_FAMILY)
class HostInfoWindows(HostInfo):
  SERVICE_STATUS_CMD = 'If ((Get-Service | Where-Object {{$_.Name -eq \'{0}\'}}).Status -eq \'Running\') {{echo "Running"; $host.SetShouldExit(0)}} Else {{echo "Stopped"; $host.SetShouldExit(1)}}'
  GET_USERS_CMD = '$accounts=(Get-WmiObject -Class Win32_UserAccount -Namespace "root\cimv2" -Filter "LocalAccount=\'$True\'" -ComputerName "LocalHost" -ErrorAction Stop); foreach ($acc in $accounts) {echo $acc.Name}'
  GET_JAVA_PROC_CMD = 'foreach ($process in (gwmi Win32_Process -Filter "name = \'java.exe\'")){echo $process.ProcessId;echo $process.CommandLine; echo $process.GetOwner().User}'

  DEFAULT_LIVE_SERVICES = [
    {OSConst.WINSRV_FAMILY: "W32Time"}
  ]
  DEFAULT_USERS = ["hadoop"]

  def checkUsers(self, users, results):
    get_users_cmd = ["powershell", '-noProfile', '-NonInteractive', '-nologo', "-Command", self.GET_USERS_CMD]
    try:
      osStat = subprocess.Popen(get_users_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      out, err = osStat.communicate()
    except:
      raise Exception("Failed to get users.")
    for user in out.split(os.linesep):
      if user in users:
        result = {}
        result['name'] = user
        result['status'] = "Available"
        results.append(result)

  def checkIptables(self):
    from ambari_commons.os_windows import run_powershell_script, CHECK_FIREWALL_SCRIPT

    out = run_powershell_script(CHECK_FIREWALL_SCRIPT)
    if out[0] != 0:
      logger.warn("Unable to check firewall status:{0}".format(out[2]))
      return False
    profiles_status = [i for i in out[1].split("\n") if not i == ""]
    if "1" in profiles_status:
      return True
    return False

  def createAlerts(self, alerts):
    # TODO AMBARI-7849 Implement createAlerts for Windows
    return alerts

  def javaProcs(self, list):
    try:
      from ambari_commons.os_windows import run_powershell_script
      code, out, err = run_powershell_script(self.GET_JAVA_PROC_CMD)
      if code == 0:
        splitted_output = out.split(os.linesep)
        for i in [index for index in range(0, len(splitted_output)) if (index % 3) == 0]:
          pid = splitted_output[i]
          cmd = splitted_output[i + 1]
          user = splitted_output[i + 2]
          if not 'AmbariServer' in cmd:
            if 'java' in cmd:
              dict = {}
              dict['pid'] = int(pid)
              dict['hadoop'] = False
              for filter in self.PROC_FILTER:
                if filter in cmd:
                  dict['hadoop'] = True
              dict['command'] = cmd.strip()
              dict['user'] = user
              list.append(dict)
    except Exception as e:
      pass
    pass

  def getServiceStatus(self, service_name):
    from ambari_commons.os_windows import run_powershell_script
    code, out, err = run_powershell_script(self.SERVICE_STATUS_CMD.format(service_name))
    return out, err, code

  def register(self, dict, componentsMapped=True, commandsInProgress=True):
    """ Return various details about the host
    componentsMapped: indicates if any components are mapped to this host
    commandsInProgress: indicates if any commands are in progress
    """
    dict['hostHealth'] = {}

    java = []
    self.javaProcs(java)
    dict['hostHealth']['activeJavaProcs'] = java

    liveSvcs = []
    self.checkLiveServices(self.DEFAULT_LIVE_SERVICES, liveSvcs)
    dict['hostHealth']['liveServices'] = liveSvcs

    dict['umask'] = str(self.getUMask())
    dict['iptablesIsRunning'] = self.checkIptables()
    dict['reverseLookup'] = self.checkReverseLookup()

    # If commands are in progress or components are already mapped to this host
    # then do not perform certain expensive host checks
    if componentsMapped or commandsInProgress:
      dict['existingRepos'] = [self.RESULT_UNAVAILABLE]
      dict['installedPackages'] = []
      dict['alternatives'] = []
      dict['stackFoldersAndFiles'] = []
      dict['existingUsers'] = []
    else:
      existingUsers = []
      self.checkUsers(self.DEFAULT_USERS, existingUsers)
      dict['existingUsers'] = existingUsers
      # TODO check HDP stack and folders here
      self.reportFileHandler.writeHostCheckFile(dict)

    # The time stamp must be recorded at the end
    dict['hostHealth']['agentTimeStampAtReporting'] = int(time.time() * 1000)


def main(argv=None):
  h = HostInfo()
  struct = {}
  h.register(struct)
  print(struct)


if __name__ == '__main__':
  main()
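
# Rough sketch of the structure that register() fills in (field names are taken
# from the code above; values are illustrative assumptions, not captured output):
#
#   {
#     'hostHealth': {
#       'activeJavaProcs': [{'pid': 1234, 'hadoop': True, 'command': '...', 'user': 'hdfs'}],
#       'liveServices': [{'name': 'ntpd', 'status': 'Healthy', 'desc': ''}],
#       'agentTimeStampAtReporting': 1428000000000
#     },
#     'umask': '18',
#     'transparentHugePage': 'never',   # Linux implementation only
#     'iptablesIsRunning': False,
#     'reverseLookup': True,
#     'existingRepos': ['unable_to_determine'],
#     'installedPackages': [],
#     'alternatives': [],
#     'stackFoldersAndFiles': [],
#     'existingUsers': []
#   }
#
# main() above registers against an empty dict with the default flags and
# prints a dictionary of this shape for the local host.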