finalizeNodes.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. <?php
  2. include_once '../util/Logger.php';
  3. include_once '../conf/Config.inc';
  4. include_once 'localDirs.php';
  5. include_once "../util/lock.php";
  6. include_once "../util/util.php";
  7. include_once '../db/HMCDBAccessor.php';
  8. include_once "../util/HMCTxnUtils.php";
  9. include_once 'commandUtils.php';
  // Shared DB accessor, constructed from the path stored in $GLOBALS["DB_PATH"].
  // The script body further down uses it to update the sub-transaction status
  // and the per-host discovery status.
  10. $dbAccessor = new HMCDBAccessor($GLOBALS["DB_PATH"]);
  /*
   * Scan command output for a regular-expression pattern.
   *
   * $output  - list of output lines, indexed 0..n-1
   * $pattern - PCRE pattern body (it is wrapped in "/" delimiters here, so it
   *            must not contain an unescaped "/")
   * $ret     - value to return when any line matches: callers pass 0 when
   *            $pattern describes success and 1 when it describes an error
   *
   * Returns $ret as soon as one line matches; otherwise returns the opposite
   * flag, ($ret + 1) % 2.
   */
  function check_error ($output, $pattern, $ret) {
    $noMatchResult = ($ret + 1) % 2;
    $regex = "/$pattern/";
    $lineCount = count($output);
    $idx = 0;
    while ($idx < $lineCount) {
      if (preg_match ($regex, $output[$idx])) {
        return $ret;
      }
      $idx++;
    }
    return $noMatchResult;
  }
  /*
   * Sign and verify puppet agents.
   *
   * Steps:
   *   1. List certificates already signed on the puppet master and mark the
   *      matching hosts as done.
   *   2. Poll the master (up to $waitLoops times, $sleepInterval seconds
   *      apart) for pending certificate requests and sign those belonging to
   *      the remaining hosts.
   *   3. Re-read the signed list so that hosts whose certificate is signed by
   *      now are marked as succeeded.
   *   4. Run "puppet kick --ping" in batches of 10 hosts, with up to 3
   *      attempts, against every host that has no recorded sign failure.
   *
   * $hosts  - list of host names
   * $logger - logger object providing log_info / log_debug / log_error
   *
   * Returns an array keyed by (trimmed) host name; each value is
   *   array("discoveryStatus" => "SUCCESS" | "FAILED",
   *         "badHealthReason" => string, "" on success).
   *
   * NOTE: host names are matched as literal substrings of the puppet output
   * lines, so a host name that is a substring of another host's name can
   * still match that other host's line.
   */
  function sign_and_verify_agent ($hosts, $logger) {
    // Normalise names once so every map below is keyed consistently; the
    // original keyed $hostsState by the raw name but updated it by the
    // trimmed one.
    $hosts = array_map('trim', $hosts);
    $origHosts = $hosts;
    $totalCnt = count($hosts);
    $output = array();
    $hostsState = array();
    // Per-host regex with the name escaped, so "." and other metacharacters
    // in a host name are matched literally.
    $hostPatterns = array();
    foreach ($hosts as $host) {
      $hostsState[$host] = FALSE;
      $hostPatterns[$host] = "/" . preg_quote($host, "/") . "/";
    }

    $logger->log_info("Starting sign/verify puppet agent for " . $totalCnt
        . " nodes, hosts=" . implode(",", $origHosts));

    // Step 1: hosts that are already signed need no further signing.
    $signed_hosts = array();
    $logger->log_info("Getting puppet master list to find all signed agents");
    $cmd = "puppet cert --confdir=/etc/puppet/master list --all | grep \"^+ \"";
    exec ($cmd, $signed_hosts, $err);
    foreach ($signed_hosts as $signedLine) {
      foreach ($hosts as $i => $host) {
        if (preg_match ($hostPatterns[$host], $signedLine)) {
          unset($hosts[$i]);
          $hostsState[$host] = TRUE;
        }
      }
    }

    // Step 2: wait for the remaining agents to request a cert, then sign it.
    $waitLoops = 10;
    $sleepInterval = 3;
    $waitLoop = 0;
    $logger->log_info("Looping through until all puppet agents are signed");
    for ($waitLoop = 0; $waitLoop < $waitLoops; $waitLoop++) {
      $waitSecs = $waitLoop * $sleepInterval;
      if ($waitLoop > 0) {
        $logger->log_info("Waited " . $waitSecs . " seconds for puppet cert sign"
            . ", hostsRemaining=" . count($hosts)
            . ", totalHosts=" . $totalCnt
            . ", totalWaitedTimeSeconds=" . $waitSecs);
      }
      $unsigned_hosts = array();
      $cmd = "puppet cert --confdir=/etc/puppet/master list --all | grep -v \"^+ \"";
      exec ($cmd, $unsigned_hosts, $err);
      foreach ($hosts as $i => $host) {
        $hasPendingRequest = FALSE;
        foreach ($unsigned_hosts as $unsignedLine) {
          if (preg_match ($hostPatterns[$host], $unsignedLine)) {
            $hasPendingRequest = TRUE;
            break;
          }
        }
        if (!$hasPendingRequest) {
          continue;
        }
        $logger->log_debug("Signing certificate for ".$host."\n");
        $out_arr = array();
        // Host names come from a file; quote them before handing to the shell.
        $cmd = "puppet cert --confdir=/etc/puppet/master sign " . escapeshellarg($host);
        exec ($cmd, $out_arr, $retcode);
        if ($retcode != 0) {
          $logger->log_error("Failed to sign cert for host " . $host);
          $output[$host] =
            array ( "discoveryStatus" => "FAILED",
                "badHealthReason" => "Puppet cert sign failed: " . implode(";", $out_arr));
          $hostsState[$host] = FALSE;
        } else {
          $logger->log_info("Puppet cert sign succeeded for host " . $host);
          $hostsState[$host] = TRUE;
          unset($output[$host]);
        }
        // A sign attempt (successful or not) is made once per host.
        unset($hosts[$i]);
      }
      if (empty($hosts)) {
        break;
      }
      sleep($sleepInterval);
    }
    // The loop counter only reaches $waitLoops when we never broke out early.
    if ($waitLoop == $waitLoops) {
      $logger->log_error("Timed out waiting for all puppet agents to ping master");
    }

    // Step 3: re-check if the hosts are now signed.
    $logger->log_info("Re-checking to ensure all puppet hosts are signed");
    $signed_hosts = array();
    $cmd = "puppet cert --confdir=/etc/puppet/master list --all | grep \"^+ \"";
    exec ($cmd, $signed_hosts, $err);
    foreach ($signed_hosts as $signedLine) {
      foreach ($hostsState as $hostName => $state) {
        if (preg_match ($hostPatterns[$hostName], $signedLine)) {
          // Update the host that actually matched (the original code used a
          // stale $host variable left over from an earlier loop).
          $logger->log_info("Puppet cert signed for host " . $hostName);
          $hostsState[$hostName] = TRUE;
          unset($output[$hostName]);
        }
      }
    }

    $countFailed = 0;
    $countSucceeded = 0;
    foreach ($hostsState as $hostName => $state) {
      if ($state) {
        $countSucceeded++;
      } else {
        $countFailed++;
      }
    }
    $logger->log_info("Puppet cert sign status"
        . ", totalHosts=" . $totalCnt
        . ", succeededHostsCount=" . $countSucceeded
        . ", failedHostsCount=" . $countFailed);

    sleep(5);

    // Step 4: run multiple attempts for pings to handle intermittent failures.
    // Hosts with a recorded sign failure are not pinged.
    $pendingNodes = array();
    foreach ($origHosts as $host) {
      if (array_key_exists ($host, $output)) {
        continue;
      }
      $pendingNodes[] = $host;
    }

    $retryAttempt = 0;
    do {
      $retryAttempt++;
      if ($retryAttempt > 1) {
        // keep a small sleep between retries, no sleep on first loop
        sleep(3);
      }
      $logger->log_debug("Puppet kick --ping retry attempt " . $retryAttempt
          . ", pendingHoststoCheck=" . implode(",", $pendingNodes));

      // Run kick ping in batches of 10 hosts
      $hostsToKick = array_chunk($pendingNodes, 10);
      $failedNodes = array();
      foreach ($hostsToKick as $idx => $hostKickList) {
        $hostList = implode(",", $hostKickList);
        /* Give puppet kick --ping to check if agent is working */
        $logger->log_debug("Puppet kick --ping for batch $idx , hosts=".$hostList);
        $hostListStr = "";
        foreach ($hostKickList as $hostToKick) {
          $hostListStr .= " --host " . escapeshellarg($hostToKick);
        }
        $out_arr = array();
        $cmd = "puppet kick -f --parallel 10 --ping $hostListStr 2>/dev/null";
        exec ($cmd, $out_arr, $err);
        // TODO do we need to check $err ?

        // Collect, per host, the output lines mentioning it and its exit code.
        $pHostOutput = array();
        $pHostResponse = array();
        foreach ($out_arr as $line) {
          foreach ($hostKickList as $host) {
            if (!preg_match ($hostPatterns[$host], $line)) {
              continue;
            }
            if (!isset($pHostOutput[$host])) {
              $pHostOutput[$host] = array();
            }
            $pHostOutput[$host][] = $line;
            $exitPattern = "/" . preg_quote($host, "/")
                . ' finished with exit code (\d+)/';
            $matches = array();
            if (preg_match ($exitPattern, $line, $matches) > 0) {
              $pHostResponse[$host] = (int)$matches[1];
            }
          }
        }
        $logger->log_debug("Output for batch $idx, outputLogs="
            . print_r($pHostOutput, true) . " , errorCodes="
            . print_r($pHostResponse, true) );

        foreach ($hostKickList as $host) {
          if (isset($pHostResponse[$host])
              && $pHostResponse[$host] == 0) {
            $logger->log_info("Puppet kick succeeded for host " . $host);
            $hostsState[$host] = TRUE;
            unset($output[$host]);
            continue;
          }
          $logger->log_error("Failed to do puppet kick -ping on host " . $host);
          $failedNodes[] = $host;
          // -1 means puppet kick reported no exit code for this host.
          $errorCode = -1;
          if (isset($pHostResponse[$host])) {
            $errorCode = $pHostResponse[$host];
          }
          $errorLogs = "Puppet kick failed";
          if (isset($pHostOutput[$host])) {
            $errorLogs = implode(";", $pHostOutput[$host]);
          }
          // Keep the first recorded failure across retries.
          if (!isset($output[$host])) {
            $output[$host] =
              array ( "discoveryStatus" => "FAILED",
                  "badHealthReason" => "Puppet kick failed: "
                  . ", error=" . $errorCode
                  . ", outputLogs=" . $errorLogs);
          }
          $hostsState[$host] = FALSE;
        }
      }
      $pendingNodes = $failedNodes;
    } while (!empty($pendingNodes) && $retryAttempt < 3);

    $countFailed = 0;
    $countSucceeded = 0;
    foreach ($hostsState as $hostName => $state) {
      if ($state) {
        $countSucceeded++;
      } else {
        $countFailed++;
      }
    }
    $logger->log_info("Puppet kick status"
        . ", totalHosts=" . $totalCnt
        . ", succeededHostsCount=" . $countSucceeded
        . ", failedHostsCount=" . $countFailed);

    // Build the per-host result map returned to the caller.
    $response = array();
    foreach ($hostsState as $host => $state) {
      if ($state) {
        $response[$host] = array ( "discoveryStatus" => "SUCCESS",
            "badHealthReason" => "");
      } else if (isset($output[$host])) {
        $response[$host] = $output[$host];
      } else {
        $logger->log_error("Timed out waiting for puppet agent on host " . $host);
        $response[$host] = array ( "discoveryStatus" => "FAILED",
            "badHealthReason" => "Puppet cert sign timed out");
      }
    }
    $logger->log_info("Completed sign/verify puppet agent for "
        . count($response) . " nodes"
        . ", result=" . print_r($response, true));
    return $response;
  }
  /*
   * Script body.
   *
   * Positional arguments read from $argv:
   *   1 cluster name, 2 deploy user, 3 root transaction id,
   *   4 this step's sub-transaction id, 5 parent sub-transaction id,
   *   6 path of the hosts file (read as input, then rewritten with only the
   *     hosts that succeeded).
   *
   * For every host it creates "<host>.out", then after signing/verifying
   * writes "<host>.done" containing 0 (success) or 1 (failure), plus
   * "<host>.err" holding the failure reason, under <clusterDir>/finalizeNodes/.
   * The sub-transaction status is updated to STARTED at the beginning and to
   * SUCCESS / FAILED / TOTALFAILURE at the end.
   */
  $clusterName = $argv[1];
  $deployUser = $argv[2];
  $rootTxnId = $argv[3];
  $mySubTxnId = $argv[4];
  $parentSubTxnId = $argv[5];
  $readFromFile = $argv[6];

  $hosts = readHostsFile($readFromFile);
  $hosts = convertToLowerCase($hosts);
  $totalHosts = count($hosts);

  $logger = new HMCLogger("PuppetFinalize:txnId="
      . $rootTxnId . ":subTxnId=" . $mySubTxnId);
  $logger->log_info("Starting signing of puppet agents certs for "
      . count($hosts) . " hosts");

  $opStatus = "STARTED";
  $subTransactionReturnValue = $dbAccessor->updateSubTransactionOpStatus($clusterName, $parentSubTxnId, $mySubTxnId, $opStatus);
  if ($subTransactionReturnValue["result"] != 0 ) {
    $logger->log_error("Got error while updating subTxn: ".$subTransactionReturnValue["error"]);
    print json_encode($subTransactionReturnValue);
    return;
  }

  // Create progress files for UI to track
  $operationName = "finalizeNodes";
  $clusterDir = getClusterDir($clusterName);
  $myDir = $clusterDir . $operationName . "/";
  if (is_dir($myDir)) {
    rrmdir($myDir);
  }
  if (!mkdir($myDir)) {
    // Best effort: keep going as before, but leave a trace of the failure.
    $logger->log_error("Failed to create progress directory " . $myDir);
  }
  foreach ($hosts as $host) {
    $fileName = $myDir . "/" . $host . ".out";
    $h = fopen($fileName, "a");
    if ($h !== FALSE) {
      fclose($h);
    }
  }

  $result = sign_and_verify_agent ($hosts,$logger);
  $logger->log_debug("Puppet Cert Sign Result:\n".print_r($result, true));

  // The hosts file is rewritten so that it lists only the successful hosts.
  $nodeFileOut = fopen($readFromFile, "w");
  if ($nodeFileOut === FALSE) {
    $subTransactionReturnValue = $dbAccessor->updateSubTransactionOpStatus($clusterName, $parentSubTxnId, $mySubTxnId, "TOTALFAILURE");
    $logger->log_error("Got error while trying to rewrite hosts file");
    return;
  }

  $updateHosts = array();
  $failedHosts = 0;
  $successfulHosts = 0;
  foreach ($result as $hostName => $hostInfo) {
    $fileName = $myDir . "/" . $hostName . ".done";
    $errFileName = $myDir . "/" . $hostName . ".err";
    $hostFailed = ($hostInfo["discoveryStatus"] == "FAILED");

    if ($hostFailed) {
      $updateHosts[$hostName] = $hostInfo;
      $errorString = $hostInfo["badHealthReason"];
      $f = fopen($errFileName, "w");
      if ($f !== FALSE) {
        // fwrite may write fewer bytes than asked; loop until all is written.
        for ($written = 0; $written < strlen($errorString);) {
          $writtenBytes = fwrite($f, substr($errorString, $written));
          if ($writtenBytes === FALSE) {
            $logger->log_error("Failed to write error file for puppet cert sign failure"
                . ", host=" . $hostName
                . ", errFile=" . $errFileName
                . ", error=" . $errorString);
            break;
          }
          $written += $writtenBytes;
        }
        fflush($f);
        fclose($f);
      } else {
        $logger->log_error("Failed to write error file for puppet cert sign failure"
            . ", host=" . $hostName
            . ", errFile=" . $errFileName
            . ", error=" . $errorString);
      }
      $failedHosts++;
    } else {
      // write the nodename to the readFromFile file.
      fwrite($nodeFileOut, $hostName."\n");
      $successfulHosts++;
    }

    // Write the status marker directly instead of through a shell "echo >"
    // whose target path embedded the unescaped host name. The content is
    // the same: the digit followed by a newline.
    $doneMarker = $hostFailed ? "1\n" : "0\n";
    if (file_put_contents($fileName, $doneMarker) === FALSE) {
      $logger->log_error("Failed to write status file " . $fileName);
    }
  }
  fclose($nodeFileOut);

  $logger->log_debug("Updating DB for hosts discovery status for puppet agent cert signing");
  $ret = $dbAccessor->updateHostDiscoveryStatus($clusterName, $updateHosts);
  if ($ret["result"] != 0) {
    $logger->log_error("Failed to update DB for hosts status, error="
        . $ret["error"]);
    // TODO - handle failure?
  }

  // SUCCESS unless some host failed; TOTALFAILURE when none succeeded.
  $opStatus = "SUCCESS";
  if ($totalHosts > 0) {
    if ($successfulHosts == 0) {
      $opStatus = "TOTALFAILURE";
    } else if ($failedHosts > 0) {
      $opStatus = "FAILED";
    }
  }
  $logger->log_info("Puppet finalize, succeeded for " . $successfulHosts
      . " and failed for " . $failedHosts . " of total " . $totalHosts . " hosts");

  $subTransactionReturnValue = $dbAccessor->updateSubTransactionOpStatus($clusterName, $parentSubTxnId, $mySubTxnId, $opStatus);
  if ($subTransactionReturnValue["result"] != 0 ) {
    $logger->log_error("Got error while updating subTxn: ".$subTransactionReturnValue["error"]);
    print json_encode($subTransactionReturnValue);
    return;
  }
  $logger->log_info("Completed signing of certs for puppet agents, opStatus=" . $opStatus);
  370. ?>