8 år sedan · 52347d2e4e
--- a/contrib/utils/perf/deploy-gce-perf-cluster.py
+++ b/contrib/utils/perf/deploy-gce-perf-cluster.py
@@ -27,18 +27,20 @@ import traceback
 
				 import re
			
 
				 import socket
			
 
				 
			
 
				-cluster_name="perf-cluster"
			
 
				-ambari_repo_file_url="http://s3.amazonaws.com/dev.hortonworks.com/ambari/centos6/2.x/latest/trunk/ambaribn.repo"
			
 
				+cluster_prefix = "perf"
			
 
				+ambari_repo_file_url = "http://s3.amazonaws.com/dev.hortonworks.com/ambari/centos6/2.x/latest/trunk/ambaribn.repo"
			
 
				 
			
 
				-public_hostname_script="foo"
			
 
				-hostname_script="foo"
			
 
				+public_hostname_script = "foo"
			
 
				+hostname_script = "foo"
			
 
				 
			
 
				-start_number=1
			
 
				-number_of_agents_on_host=50
			
 
				+NUMBER_OF_AGENTS_ON_HOST = 50
			
 
				 
			
 
				 
			
 
				 class SSH:
			
 
				-  """ Ssh implementation of this """
			
 
				+  """
			
 
				+  Ssh implementation of this
			
 
				+  """
			
 
				+
			
 
				   def __init__(self, user, sshkey_file, host, command, custom_option='', errorMessage = None):
			
 
				     self.user = user
			
 
				     self.sshkey_file = sshkey_file
			
@@ -46,8 +48,6 @@ class SSH:
 
				     self.command = command
			
 
				     self.errorMessage = errorMessage
			
 
				     self.custom_option = custom_option
			
 
				-    pass
			
 
				-
			
 
				 
			
 
				   def run(self):
			
 
				     sshcommand = ["ssh",
			
@@ -91,12 +91,15 @@ class SSH:
 
				 
			
 
				     print "SSH command execution finished"
			
 
				 
			
 
				-    return  {"exitstatus": sshstat.returncode, "log": log, "errormsg": errorMsg}
			
 
				+    return {"exitstatus": sshstat.returncode, "log": log, "errormsg": errorMsg}
			
 
				 
			
 
				 
			
 
				 class SCP:
			
 
				-  """ SCP implementation that is thread based. The status can be returned using
			
 
				-   status val """
			
 
				+  """
			
 
				+  SCP implementation that is thread based. The status can be returned using
			
 
				+  status val
			
 
				+  """
			
 
				+
			
 
				   def __init__(self, user, sshkey_file, host, inputFile, remote, errorMessage = None):
			
 
				     self.user = user
			
 
				     self.sshkey_file = sshkey_file
			
@@ -104,8 +107,6 @@ class SCP:
 
				     self.inputFile = inputFile
			
 
				     self.remote = remote
			
 
				     self.errorMessage = errorMessage
			
 
				-    pass
			
 
				-
			
 
				 
			
 
				   def run(self):
			
 
				     scpcommand = ["scp",
			
@@ -115,7 +116,6 @@ class SCP:
 
				                   "-o", "StrictHostKeyChecking=no",
			
 
				                   "-i", self.sshkey_file, self.inputFile, self.user + "@" +
			
 
				                                                           self.host + ":" + self.remote]
			
 
				-
			
 
				     i = 1
			
 
				     while True:
			
 
				       try:
			
@@ -162,27 +162,33 @@ def main():
 
				   parser.add_argument('--key', type=str,
			
 
				                       action='store', help='Path to GCE ssh key.')
			
 
				 
			
 
				-  parser.add_argument('--cluster-prefix', type=str,
			
 
				-                      action='store', help='Cluster name prefix.')
			
 
				+  parser.add_argument('--cluster-suffix', type=str,
			
 
				+                      action='store', help='Cluster name suffix.')
			
 
				 
			
 
				   parser.add_argument('--agent-prefix', type=str,
			
 
				                       action='store', help='Agent name prefix.')
			
 
				 
			
 
				   parser.add_argument('--agents-count', type=int,
			
 
				-                      action='store', help='Agents count for whole cluster(multiples of 50).')
			
 
				+                      action='store', help='Agents count for whole cluster (multiples of 50).')
			
 
				 
			
 
				   if len(sys.argv) <= 1:
			
 
				     parser.print_help()
			
 
				-  else:
			
 
				-    args = parser.parse_args()
			
 
				-    do_work(args)
			
 
				+    sys.exit(-1)
			
 
				+
			
 
				+  args = parser.parse_args()
			
 
				+  do_work(args)
			
 
				 
			
 
				-# base method which process cluster deployment
			
 
				 def deploy_cluster(args):
			
 
				-  number_of_nodes=args.agents_count/number_of_agents_on_host
			
 
				+  """
			
 
				+  Process cluster deployment
			
 
				+  :param args: Command line args.
			
 
				+  """
			
 
				+  # When dividing, need to get the ceil.
			
 
				+  number_of_nodes = ((args.agents_count - 1) / NUMBER_OF_AGENTS_ON_HOST) + 1
			
 
				+
			
 
				   # trying to create cluster with needed params
			
 
				-  print "Creating cluster {0}-{1} with {2} large nodes on centos6...".format(args.cluster_prefix, cluster_name, str(number_of_nodes))
			
 
				-  execute_command(args, args.controller, "/usr/sbin/gce up {0}-{1} {2} --centos6 --large".format(args.cluster_prefix, cluster_name, str(number_of_nodes)),
			
 
				+  print "Creating cluster {0}-{1} with {2} large nodes on centos6...".format(cluster_prefix, args.cluster_suffix, str(number_of_nodes))
			
 
				+  execute_command(args, args.controller, "/usr/sbin/gce up {0}-{1} {2} --centos6 --large".format(cluster_prefix, args.cluster_suffix, str(number_of_nodes)),
			
 
				                   "Failed to create cluster, probably not enough resources!", "-tt")
			
 
				 
			
 
				   # VMs are not accessible immediately
			
@@ -190,19 +196,20 @@ def deploy_cluster(args):
 
				 
			
 
				   # getting list of vms information like hostname and ip address
			
 
				   print "Getting list of virtual machines from cluster..."
			
 
				+  # Dictionary from host name to IP
			
 
				   vms = get_vms_list(args)
			
 
				 
			
 
				   # check number of nodes in cluster to be the same as user asked
			
 
				   print "Checking count of created nodes in cluster..."
			
 
				   if not vms or len(vms) < number_of_nodes:
			
 
				-    raise Exception("Can not bring up enough nodes. Requested {0}, but got: {1}. Probably not enough resources!".format(number_of_nodes, len(vms)))
			
 
				+    raise Exception("Cannot bring up enough nodes. Requested {0}, but got {1}. Probably not enough resources!".format(number_of_nodes, len(vms)))
			
 
				 
			
 
				   print "GCE cluster was successfully created!"
			
 
				   pretty_print_vms(vms)
			
 
				 
			
 
				   # installing/starting ambari-server and ambari-agents on each host
			
 
				   server_host_name = sorted(vms.items())[0][0]
			
 
				-  server_installed=False
			
 
				+  server_installed = False
			
 
				 
			
 
				   print "Creating server.sh script (which will be executed on server to install/configure/start ambari-server and ambari-agent)..."
			
 
				   create_server_script(args, server_host_name)
			
@@ -211,14 +218,33 @@ def deploy_cluster(args):
 
				   create_agent_script(args, server_host_name)
			
 
				 
			
 
				   time.sleep(10)
			
 
				+
			
 
				+  # If the user asks for a number of agents that is not a multiple of 50, then only create how many are needed instead
			
 
				+  # of 50 on every VM.
			
 
				+  num_agents_left_to_create = args.agents_count
			
 
				+
			
 
				+  start_num = 1
			
 
				   for (hostname, ip) in sorted(vms.items()):
			
 
				+    num_agents_on_this_host = min(num_agents_left_to_create, NUMBER_OF_AGENTS_ON_HOST)
			
 
				+
			
 
				     print "=========================="
			
 
				-    print "Working on {0}".format(hostname)
			
 
				+    print "Working on VM {0} that will contain hosts %d - %d".format(hostname, start_num, start_num + num_agents_on_this_host - 1)
			
 
				+
			
 
				+    # The agent multiplier config will be different on each VM.
			
 
				+
			
 
				+    cmd_generate_multiplier_conf = "mkdir -p /etc/ambari-agent/conf/ ; printf \"start={0}\\nnum={1}\\nprefix={2}\" > /etc/ambari-agent/conf/agent-multiplier.conf".format(start_num, num_agents_on_this_host, args.agent_prefix)
			
 
				+    start_num += num_agents_on_this_host
			
 
				+    num_agents_left_to_create -= num_agents_on_this_host
			
 
				+
			
 
				     if not server_installed:
			
 
				       remote_path = "/server.sh"
			
 
				       local_path = "server.sh"
			
 
				       print "Copying server.sh to {0}...".format(hostname)
			
 
				       put_file(args, ip, local_path, remote_path, "Failed to copy file!")
			
 
				+
			
 
				+      print "Generating agent-multiplier.conf"
			
 
				+      execute_command(args, ip, cmd_generate_multiplier_conf, "Failed to generate agent-multiplier.conf on host {0}".format(hostname))
			
 
				+
			
 
				       print "Executing remote ssh command (set correct permissions and start executing server.sh in separate process) on {0}...".format(hostname)
			
 
				       execute_command(args, ip, "cd /; chmod 777 server.sh; nohup ./server.sh >/server.log 2>&1 &",
			
 
				                     "Install/configure/start server script failed!")
			
@@ -228,6 +254,10 @@ def deploy_cluster(args):
 
				       local_path = "agent.sh"
			
 
				       print "Copying agent.sh to {0}...".format(hostname)
			
 
				       put_file(args, ip, local_path, remote_path, "Failed to copy file!")
			
 
				+
			
 
				+      print "Generating agent-multiplier.conf"
			
 
				+      execute_command(args, ip, cmd_generate_multiplier_conf, "Failed to generate agent-multiplier.conf on host {0}".format(hostname))
			
 
				+
			
 
				       print "Executing remote ssh command (set correct permissions and start executing agent.sh in separate process) on {0}...".format(hostname)
			
 
				       execute_command(args, ip, "cd /; chmod 777 agent.sh; nohup ./agent.sh >/agent.log 2>&1 &",
			
 
				                     "Install/configure start agent script failed!")
			
@@ -235,75 +265,90 @@ def deploy_cluster(args):
 
				   print "All scripts where successfully copied and started on all hosts. " \
			
 
				         "\nPay attention that server.sh script need 5 minutes to finish and agent.sh need 3 minutes!"
			
 
				 
			
 
				-# check if all required params were passed by user
			
 
				-# if all needed params available then start cluster deploy
			
 
				+
			
 
				 def do_work(args):
			
 
				+  """
			
 
				+  Check that all required args are passed in. If so, deploy the cluster.
			
 
				+  :param args: Command line args
			
 
				+  """
			
 
				   if not args.controller:
			
 
				-    raise Failure("GCE controller ip address not defined!")
			
 
				+    raise Exception("GCE controller ip address is not defined!")
			
 
				 
			
 
				   if not args.key:
			
 
				-    raise Failure("Path to gce ssh key not defined!")
			
 
				+    raise Exception("Path to gce ssh key is not defined!")
			
 
				 
			
 
				-  if not args.cluster_prefix:
			
 
				-    raise Failure("Cluster name prefix not defined!")
			
 
				+  if not args.cluster_suffix:
			
 
				+    raise Exception("Cluster name suffix is not defined!")
			
 
				 
			
 
				   if not args.agent_prefix:
			
 
				-    raise Failure("Agent name prefix not defined!")
			
 
				+    raise Exception("Agent name prefix is not defined!")
			
 
				 
			
 
				   if not args.agents_count:
			
 
				-    raise Failure("Agents count for whole cluster(multiples of 50) not defined!")
			
 
				+    raise Exception("Agents count for whole cluster is not defined (will put 50 Agents per VM)!")
			
 
				 
			
 
				   deploy_cluster(args)
			
 
				 
			
 
				-# creating server.sh script in the same dir where current script is located
			
 
				-# server.sh script will install, configure and start ambari-server and ambari-agent on host
			
 
				-def create_server_script(args, server_host_name):
			
 
				-  file = open("server.sh", "w")
			
 
				-
			
 
				-  file.write("#!/bin/bash\n")
			
 
				-
			
 
				-  file.write("wget -O /etc/yum.repos.d/ambari.repo {0}\n".format(ambari_repo_file_url))
			
 
				-  file.write("yum clean all; yum install git ambari-server ambari-agent -y\n")
			
 
				-  file.write("cd /home; git clone https://github.com/apache/ambari.git\n")
			
 
				-
			
 
				-  file.write("cp -r /home/ambari/ambari-server/src/main/resources/stacks/PERF /var/lib/ambari-server/resources/stacks/PERF\n")
			
 
				-  file.write("cp -r /home/ambari/ambari-server/src/main/resources/stacks/PERF /var/lib/ambari-agent/cache/stacks/PERF\n")
			
 
				-
			
 
				-  file.write("ambari-server setup -s\n")
			
 
				-  file.write("sed -i -e 's/false/true/g' /var/lib/ambari-server/resources/stacks/PERF/1.0/metainfo.xml\n")
			
 
				-  file.write("ambari-server start --skip-database-check\n")
			
 
				 
			
 
				-  file.write("sed -i -e 's/hostname=localhost/hostname={0}/g' /etc/ambari-agent/conf/ambari-agent.ini\n".format(server_host_name))
			
 
				-  file.write("sed -i -e 's/agent]/agent]\\nhostname_script={0}\\npublic_hostname_script={1}\\n/1' /etc/ambari-agent/conf/ambari-agent.ini\n".format(hostname_script, public_hostname_script))
			
 
				-  file.write("printf \"start={0}\\nnum={1}\\nprefix={2}\" > /etc/ambari-agent/conf/agent-multiplier.conf\n".format(start_number, number_of_agents_on_host, args.agent_prefix))
			
 
				-  file.write("python /home/ambari/ambari-agent/conf/unix/agent-multiplier.py start\n")
			
 
				-
			
 
				-  file.write("exit 0")
			
 
				+def create_server_script(args, server_host_name):
			
 
				+  """
			
 
				+  Creating server.sh script in the same dir where current script is located
			
 
				+  server.sh script will install, configure and start ambari-server and ambari-agent on host
			
 
				+  :param args: Command line args
			
 
				+  :param server_host_name: Server host name
			
 
				+  """
			
 
				+
			
 
				+  contents = "#!/bin/bash\n" + \
			
 
				+  "wget -O /etc/yum.repos.d/ambari.repo {0}\n".format(ambari_repo_file_url) + \
			
 
				+  "yum clean all; yum install git ambari-server ambari-agent -y\n" + \
			
 
				+  "cd /home; git clone https://github.com/apache/ambari.git\n" + \
			
 
				+  "cp -r /home/ambari/ambari-server/src/main/resources/stacks/PERF /var/lib/ambari-server/resources/stacks/PERF\n" + \
			
 
				+  "cp -r /home/ambari/ambari-server/src/main/resources/stacks/PERF /var/lib/ambari-agent/cache/stacks/PERF\n" + \
			
 
				+  "ambari-server setup -s\n" + \
			
 
				+  "sed -i -e 's/false/true/g' /var/lib/ambari-server/resources/stacks/PERF/1.0/metainfo.xml\n" + \
			
 
				+  "ambari-server start --skip-database-check\n" + \
			
 
				+  "sed -i -e 's/hostname=localhost/hostname={0}/g' /etc/ambari-agent/conf/ambari-agent.ini\n".format(server_host_name) + \
			
 
				+  "sed -i -e 's/agent]/agent]\\nhostname_script={0}\\npublic_hostname_script={1}\\n/1' /etc/ambari-agent/conf/ambari-agent.ini\n".format(hostname_script, public_hostname_script) + \
			
 
				+  "python /home/ambari/ambari-agent/conf/unix/agent-multiplier.py start\n" + \
			
 
				+  "exit 0"
			
 
				+
			
 
				+  with open("server.sh", "w") as f:
			
 
				+    f.write(contents)
			
 
				 
			
 
				-  file.close()
			
 
				 
			
 
				-# creating agent.sh script in the same dir where current script is located
			
 
				-# agent.sh script will install, configure and start ambari-agent on host
			
 
				 def create_agent_script(args, server_host_name):
			
 
				-  file = open("agent.sh", "w")
			
 
				-
			
 
				-  file.write("#!/bin/bash\n")
			
 
				-
			
 
				-  file.write("wget -O /etc/yum.repos.d/ambari.repo {0}\n".format(ambari_repo_file_url))
			
 
				-  file.write("yum clean all; yum install git ambari-agent -y\n")
			
 
				-  file.write("cd /home; git clone https://github.com/apache/ambari.git\n")
			
 
				-  file.write("cp -r /home/ambari/ambari-server/src/main/resources/stacks/PERF /var/lib/ambari-agent/cache/stacks/PERF\n")
			
 
				+  """
			
 
				+  Creating agent.sh script in the same dir where current script is located
			
 
				+  agent.sh script will install, configure and start ambari-agent on host
			
 
				+  :param args: Command line args
			
 
				+  :param server_host_name: Server host name
			
 
				+  """
			
 
				+
			
 
				+  # TODO, instead of cloning Ambari repo on each VM, do it on the server once and distribute to all of the agents.
			
 
				+  contents = "#!/bin/bash\n" + \
			
 
				+  "wget -O /etc/yum.repos.d/ambari.repo {0}\n".format(ambari_repo_file_url) + \
			
 
				+  "yum clean all; yum install git ambari-agent -y\n" + \
			
 
				+  "cd /home; git clone https://github.com/apache/ambari.git\n" + \
			
 
				+  "cp -r /home/ambari/ambari-server/src/main/resources/stacks/PERF /var/lib/ambari-agent/cache/stacks/PERF\n" + \
			
 
				+  "sed -i -e 's/hostname=localhost/hostname={0}/g' /etc/ambari-agent/conf/ambari-agent.ini\n".format(server_host_name) + \
			
 
				+  "sed -i -e 's/agent]/agent]\\nhostname_script={0}\\npublic_hostname_script={1}\\n/1' /etc/ambari-agent/conf/ambari-agent.ini\n".format(hostname_script, public_hostname_script) + \
			
 
				+  "python /home/ambari/ambari-agent/conf/unix/agent-multiplier.py start\n" + \
			
 
				+  "exit 0"
			
 
				+
			
 
				+  with open("agent.sh", "w") as f:
			
 
				+    f.write(contents)
			
 
				 
			
 
				-  file.write("sed -i -e 's/hostname=localhost/hostname={0}/g' /etc/ambari-agent/conf/ambari-agent.ini\n".format(server_host_name))
			
 
				-  file.write("sed -i -e 's/agent]/agent]\\nhostname_script={0}\\npublic_hostname_script={1}\\n/1' /etc/ambari-agent/conf/ambari-agent.ini\n".format(hostname_script, public_hostname_script))
			
 
				-  file.write("printf \"start={0}\\nnum={1}\\nprefix={2}\" > /etc/ambari-agent/conf/agent-multiplier.conf\n".format(start_number, number_of_agents_on_host, args.agent_prefix))
			
 
				-  file.write("python /home/ambari/ambari-agent/conf/unix/agent-multiplier.py start\n")
			
 
				-  file.write("exit 0")
			
 
				 
			
 
				-  file.close()
			
 
				-
			
 
				-# method to execute ssh commands via SSH class
			
 
				 def execute_command(args, ip, cmd, fail_message, custom_option='', login='root'):
			
 
				+  """
			
 
				+  Method to execute ssh commands via SSH class
			
 
				+  :param args: Command line args
			
 
				+  :param ip: IP to ssh to
			
 
				+  :param cmd: Command to execute
			
 
				+  :param fail_message: In case of an error, what to report
			
 
				+  :param custom_option: Custom flags
			
 
				+  :param login: Login user
			
 
				+  :return: Return execute log message
			
 
				+  """
			
 
				   ssh = SSH(login, args.key, ip, cmd, custom_option, fail_message)
			
 
				   ssh_result = ssh.run()
			
 
				   status_code = ssh_result["exitstatus"]
			
@@ -312,8 +357,18 @@ def execute_command(args, ip, cmd, fail_message, custom_option='', login='root')
 
				 
			
 
				   return ssh_result["log"][0]
			
 
				 
			
 
				-# method to copy file from local to remote host via SCP class
			
 
				+
			
 
				 def put_file(args, ip, local_file, remote_file, fail_message, login='root'):
			
 
				+  """
			
 
				+  Method to copy file from local to remote host via SCP class
			
 
				+  :param args: Command line args
			
 
				+  :param ip: IP to ssh to
			
 
				+  :param local_file: Path to local file
			
 
				+  :param remote_file: Path to remote file
			
 
				+  :param fail_message: In case of an error, what to report
			
 
				+  :param login: Login user.
			
 
				+  :return: Return copy log message
			
 
				+  """
			
 
				   scp = SCP(login, args.key, ip, local_file,
			
 
				             remote_file, fail_message)
			
 
				   scp_result = scp.run()
			
@@ -323,10 +378,15 @@ def put_file(args, ip, local_file, remote_file, fail_message, login='root'):
 
				 
			
 
				   return scp_result["log"][0]
			
 
				 
			
 
				-# method to parse "gce fqdn {cluster-name}" command output and get hosts and ips pairs for every host in cluster
			
 
				+
			
 
				 def get_vms_list(args):
			
 
				-  gce_fqdb_cmd = '/usr/sbin/gce fqdn {0}-{1}'.format(
			
 
				-    args.cluster_prefix, cluster_name)
			
 
				+  """
			
 
				+  Method to parse "gce fqdn {cluster-name}" command output and get hosts and ips pairs for every host in cluster
			
 
				+  :param args: Command line args
			
 
				+  :return: Mapping of VM host name to ip.
			
 
				+  """
			
 
				+
			
 
				+  gce_fqdb_cmd = '/usr/sbin/gce fqdn {0}-{1}'.format(cluster_prefix, args.cluster_suffix)
			
 
				   out = execute_command(args, args.controller, gce_fqdb_cmd, "Failed to get VMs list!", "-tt")
			
 
				   lines = out.split('\n')
			
 
				   #print "LINES=" + str(lines)
			
@@ -339,15 +399,15 @@ def get_vms_list(args):
 
				       if match:
			
 
				         result[match.group(2)] = match.group(1)
			
 
				       else:
			
 
				-        raise Exception('Can not parse "{0}"'.format(s))
			
 
				+        raise Exception('Cannot parse "{0}"'.format(s))
			
 
				     return result
			
 
				   else:
			
 
				-    raise Exception('Can not parse "{0}"'.format(lines))
			
 
				+    raise Exception('Cannot parse "{0}"'.format(lines))
			
 
				 
			
 
				 
			
 
				 def pretty_print_vms(vms):
			
 
				   print "----------------------------"
			
 
				-  print "server ip: {0}".format(sorted(vms.items())[0][1])
			
 
				+  print "Server IP: {0}".format(sorted(vms.items())[0][1])
			
 
				   print "Hostnames of nodes in cluster:"
			
 
				   for (hostname, ip) in sorted(vms.items()):
			
 
				     print hostname