Browse Source

HADOOP-952. Create a public (shared) Hadoop EC2 AMI.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@510224 13f79535-47bb-0310-9956-ffa450edef68
Thomas White 18 years ago
parent
commit
8f165b5fd7

+ 5 - 1
CHANGES.txt

@@ -86,9 +86,13 @@ Trunk (unreleased changes)
     directly, so that user code is no longer required in the
     JobTracker.  (omalley via cutting)
 
-26. HAOOP-1006.  Remove obsolete '-local' option from test code.
+26. HADOOP-1006.  Remove obsolete '-local' option from test code.
     (Gautam Kowshik via cutting)
 
+27. HADOOP-952. Create a public (shared) Hadoop EC2 AMI.
+    The EC2 scripts now support launch of public AMIs.
+    (tomwhite)
+    
 
 Release 0.11.2 - 2007-02-16
 

+ 4 - 0
src/contrib/ec2/README.txt

@@ -3,6 +3,10 @@ Hadoop EC2
 This collection of scripts allows you to run Hadoop clusters on Amazon.com's Elastic Compute Cloud (EC2) service described at:
 
   http://aws.amazon.com/ec2
+  
+To get help, type the following in a shell:
+  
+  bin/hadoop-ec2
 
 For full instructions, please visit the Hadoop wiki at:
 

+ 4 - 4
src/contrib/ec2/bin/create-hadoop-image

@@ -35,17 +35,17 @@ echo "Copying scripts."
 
 # Copy setup scripts
 scp $SSH_OPTS "$bin"/hadoop-ec2-env.sh "root@$HOSTNAME:"
-scp $SSH_OPTS "$bin"/image/hadoop-init "root@$HOSTNAME:/etc/init.d/hadoop-init"
+scp $SSH_OPTS "$bin"/image/hadoop-init "root@$HOSTNAME:"
 scp $SSH_OPTS "$bin"/image/create-hadoop-image-remote "root@$HOSTNAME:"
 
 # Copy private key and certificate (for bundling image)
-scp $SSH_OPTS $EC2_KEYDIR/pk-*.pem "root@$HOSTNAME:"
-scp $SSH_OPTS $EC2_KEYDIR/cert-*.pem "root@$HOSTNAME:"
+scp $SSH_OPTS $EC2_KEYDIR/pk-*.pem "root@$HOSTNAME:/mnt"
+scp $SSH_OPTS $EC2_KEYDIR/cert-*.pem "root@$HOSTNAME:/mnt"
 
 # Connect to it
 ssh $SSH_OPTS "root@$HOSTNAME" './create-hadoop-image-remote'
 
 # Register image
-ec2-register $S3_BUCKET/image.manifest.xml
+ec2-register $S3_BUCKET/hadoop-$HADOOP_VERSION.manifest.xml
 
 echo "Terminate with: ec2-terminate-instances $BOOTING_INSTANCE"

+ 36 - 0
src/contrib/ec2/bin/hadoop-ec2

@@ -0,0 +1,36 @@
+#!/bin/sh
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+  echo "Usage: hadoop-ec2 COMMAND"
+  echo "where COMMAND is one of:"
+  echo "  create-image         create a Hadoop AMI"
+  echo "  launch-cluster       launch a cluster of Hadoop EC2 instances"
+  echo "  start-hadoop         start Hadoop daemons on a cluster"
+  echo "  login                login to the master node of the Hadoop EC2 cluster"
+  echo "  run                  'launch-cluster', 'start-hadoop', 'login'"
+  echo "  terminate-cluster    terminate a cluster of Hadoop EC2 instances"
+  exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+if [ "$COMMAND" = "create-image" ] ; then
+  . "$bin"/create-hadoop-image
+elif [ "$COMMAND" = "launch-cluster" ] ; then
+  . "$bin"/launch-hadoop-cluster
+elif [ "$COMMAND" = "start-hadoop" ] ; then
+  . "$bin"/start-hadoop
+elif [ "$COMMAND" = "run" ] ; then
+  . "$bin"/run-hadoop-cluster
+elif [ "$COMMAND" = "login" ] ; then
+  . "$bin"/login-hadoop-cluster
+elif [ "$COMMAND" = "terminate-cluster" ] ; then
+  . "$bin"/terminate-hadoop-cluster
+fi
+  
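
The wrapper dispatches by sourcing (rather than executing) the matching script, so each command runs with the caller's shell environment intact. A typical end-to-end session with the new front end:

  bin/hadoop-ec2 create-image        # one-off: build and register your own AMI (optional)
  bin/hadoop-ec2 run                 # launch-cluster, start-hadoop and login in one step
  bin/hadoop-ec2 terminate-cluster   # shut the instances down when finished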

+ 29 - 15
src/contrib/ec2/bin/hadoop-ec2-env.sh.template

@@ -9,32 +9,46 @@ AWS_ACCESS_KEY_ID=
 # Your Amazon AWS secret access key.
 AWS_SECRET_ACCESS_KEY=
 
-# The Amazon S3 bucket where the Hadoop AMI you create will be stored.
-S3_BUCKET=
-
-# SSH options used when connecting to EC2 instances.
-# Change the -i option to be the absolute path to your keypair that you set up in the Amazon Getting Started guide.
-SSH_OPTS='-i /home/<yourname>/id_rsa-gsg-keypair -o StrictHostKeyChecking=no'
-
 # Location of EC2 keys.
 # The default setting is probably OK if you set up EC2 following the Amazon Getting Started guide.
-EC2_KEYDIR=~/.ec2
+EC2_KEYDIR=`dirname "$EC2_PRIVATE_KEY"`
 
-# The download URL for the Sun JDK. Visit http://java.sun.com/javase/downloads/index_jdk5.jsp and get the URL for the "Linux self-extracting file".
-JAVA_BINARY_URL=''
+# The EC2 key name used to launch instances.
+# The default is the value used in the Amazon Getting Started guide.
+KEY_NAME=gsg-keypair
 
-# The version number of the installed JDK.
-JAVA_VERSION=1.5.0_11
+# Where your EC2 private key is stored (created when following the Amazon Getting Started guide).
+# You need to change this if you don't store this with your other EC2 keys.
+PRIVATE_KEY_PATH=`echo "$EC2_KEYDIR"/"id_rsa-$KEY_NAME"`
+
+# SSH options used when connecting to EC2 instances.
+SSH_OPTS=`echo -i "$PRIVATE_KEY_PATH" -o StrictHostKeyChecking=no`
+
+# The version of Hadoop to use.
+HADOOP_VERSION=0.11.2
+
+# The Amazon S3 bucket where the Hadoop AMI is stored.
+# The default value is for public images, so it can be left unchanged if you are running a public image.
+# Change this value only if you are creating your own (private) AMI
+# so you can store it in a bucket you own.
+S3_BUCKET=hadoop-ec2-images
 
 # The EC2 group to run your cluster in.
 GROUP=hadoop-cluster-group
 
-# The version of Hadoop to install.
-HADOOP_VERSION=0.10.1
-
 # The hostname of the master node in the cluster. You need to be able to set the DNS for this host to point to the master's IP address.
 # See http://www.dyndns.com/services/dns/dyndns/, for example.
 MASTER_HOST=
 
 # The number of nodes in your cluster.
 NO_INSTANCES=2
+
+#
+# The following variables are only used when creating an AMI.
+#
+
+# The download URL for the Sun JDK. Visit http://java.sun.com/javase/downloads/index_jdk5.jsp and get the URL for the "Linux self-extracting file".
+JAVA_BINARY_URL=''
+
+# The version number of the installed JDK.
+JAVA_VERSION=1.5.0_11
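
For orientation, a minimal filled-in hadoop-ec2-env.sh might look as follows; the account values and the master hostname are hypothetical placeholders, and everything else keeps the template defaults:

  AWS_ACCOUNT_ID=123456789012             # hypothetical
  AWS_ACCESS_KEY_ID=AKIAEXAMPLE           # hypothetical
  AWS_SECRET_ACCESS_KEY=examplesecret     # hypothetical
  KEY_NAME=gsg-keypair                    # template default
  HADOOP_VERSION=0.11.2                   # template default, matches the public AMI
  S3_BUCKET=hadoop-ec2-images             # template default, the public image bucket
  MASTER_HOST=hadoop-master.example.com   # hypothetical; a DNS name you control
  NO_INSTANCES=2                          # cluster size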

+ 12 - 8
src/contrib/ec2/bin/image/create-hadoop-image-remote

@@ -6,6 +6,9 @@ bin=`dirname "$0"`
 bin=`cd "$bin"; pwd`
 . "$bin"/hadoop-ec2-env.sh
 
+# Remove environment script since it contains sensitive information
+rm -f "$bin"/hadoop-ec2-env.sh
+
 # Install Java
 cd /usr/local
 wget -nv -O java.bin $JAVA_BINARY_URL
@@ -28,19 +31,20 @@ sed -i -e "s|# export JAVA_HOME=.*|export JAVA_HOME=/usr/local/jdk${JAVA_VERSION
       /usr/local/hadoop-$HADOOP_VERSION/conf/hadoop-env.sh
 mkdir -p /mnt/hadoop/logs
 
-# Do Hadoop configuration for master hostname and cluster size on instance startup for runlevels 3 and 4.
-# Runlevel 4 is used by Xen. See http://developer.amazonwebservices.com/connect/message.jspa?messageID=45948#45948
-ln -s /etc/init.d/hadoop-init /etc/rc3.d/S99hadoop-init
-ln -s /etc/init.d/hadoop-init /etc/rc4.d/S99hadoop-init
+# Do configuration on instance startup
+echo "/root/hadoop-init" >> /etc/rc.d/rc.local
 
-# Configure networking
-ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
-cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
+# Configure networking.
+# Delete SSH authorized_keys since it includes the key it was launched with. (Note that it is re-populated when an instance starts.)
+rm -f /root/.ssh/authorized_keys
+# Ensure logging in to new hosts is seamless.
 echo '    StrictHostKeyChecking no' >> /etc/ssh/ssh_config
 
 # Bundle and upload image
 cd ~root
-ec2-bundle-vol -d /mnt -k ~root/pk-*.pem -c ~root/cert-*.pem -u $AWS_ACCOUNT_ID -s 1536 -p hadoop-$HADOOP_VERSION
+# Don't need to delete .bash_history since it isn't written until exit.
+ec2-bundle-vol -d /mnt -k /mnt/pk-*.pem -c /mnt/cert-*.pem -u $AWS_ACCOUNT_ID -s 1536 -p hadoop-$HADOOP_VERSION
+rm /mnt/pk-*.pem /mnt/cert-*.pem
 ec2-upload-bundle -b $S3_BUCKET -m /mnt/hadoop-$HADOOP_VERSION.manifest.xml -a $AWS_ACCESS_KEY_ID -s $AWS_SECRET_ACCESS_KEY
 
 # End
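
Two details in this script are worth spelling out. The credentials are staged in /mnt rather than on the root volume because ec2-bundle-vol archives the root filesystem and should not pick up a separately mounted /mnt, so the private key and certificate stay out of the shared image (deleting them afterwards is belt-and-braces). And the rc.local append replaces the old runlevel symlinks; assuming a stock Fedora-style rc.local, the file ends up roughly as:

  #!/bin/sh
  touch /var/lock/subsys/local
  /root/hadoop-init    # appended line: (re)configure Hadoop on every boot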

+ 11 - 32
src/contrib/ec2/bin/image/hadoop-init

@@ -1,16 +1,14 @@
-#!/bin/sh
-
-start() {
-  USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
-  NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`
-  MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`
-  HADOOP_HOME=`ls -d /usr/local/hadoop-*`
-  echo $NO_INSTANCES, $MASTER_HOST, $HADOOP_HOME
-  
-  sed -i -e "s|# export HADOOP_MASTER=.*|export HADOOP_MASTER=$MASTER_HOST:$HADOOP_HOME|" \
-      $HADOOP_HOME/conf/hadoop-env.sh
+# Use parameters passed in during launch to configure Hadoop
+USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
+NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`
+MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`
+HADOOP_HOME=`ls -d /usr/local/hadoop-*`
+echo $NO_INSTANCES, $MASTER_HOST, $HADOOP_HOME
+ 
+sed -i -e "s|# export HADOOP_MASTER=.*|export HADOOP_MASTER=$MASTER_HOST:$HADOOP_HOME|" \
+    $HADOOP_HOME/conf/hadoop-env.sh
       
-  cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
+cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
@@ -34,7 +32,7 @@ start() {
 </configuration>
 EOF
 
-  cat > $HADOOP_HOME/conf/mapred-default.xml <<EOF
+cat > $HADOOP_HOME/conf/mapred-default.xml <<EOF
 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
@@ -52,22 +50,3 @@ EOF
 
 </configuration>
 EOF
-
-}
-
-case "$1" in
-  start)
-        start
-        ;;
-  stop)
-        ;;
-  status)
-        ;;
-  restart|reload|condrestart)
-        ;;
-  *)
-        echo $"Usage: $0 {start|stop|restart|reload|status}"
-        exit 1
-esac
-
-exit 0
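
hadoop-init is driven entirely by the user data supplied at launch (the -d "$NO_INSTANCES,$MASTER_HOST" argument in launch-hadoop-cluster), fetched from the EC2 instance metadata service. As a sketch, for a two-node cluster with a hypothetical master name:

  # user data as passed at launch: "2,hadoop-master.example.com"
  USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
  NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`   # -> 2
  MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`    # -> hadoop-master.example.com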

+ 41 - 0
src/contrib/ec2/bin/launch-hadoop-cluster

@@ -0,0 +1,41 @@
+#!/bin/sh
+# Launch an EC2 cluster of Hadoop instances.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+ec2-describe-group | grep $GROUP > /dev/null
+if [ ! $? -eq 0 ]; then
+  echo "Creating group $GROUP"
+  ec2-add-group $GROUP -d "Group for Hadoop clusters."
+  ec2-authorize $GROUP -p 22    # ssh
+  ec2-authorize $GROUP -p 50030 # JobTracker web interface
+  ec2-authorize $GROUP -p 50060 # TaskTracker web interface
+  ec2-authorize $GROUP -o $GROUP -u $AWS_ACCOUNT_ID 
+fi
+
+# Finding Hadoop image
+AMI_IMAGE=`ec2-describe-images -a | grep $S3_BUCKET | grep $HADOOP_VERSION | grep available | awk '{print $2}'`
+
+# Start a cluster
+echo "Starting cluster with AMI $AMI_IMAGE"
+RUN_INSTANCES_OUTPUT=`ec2-run-instances $AMI_IMAGE -n $NO_INSTANCES -g $GROUP -k gsg-keypair -d "$NO_INSTANCES,$MASTER_HOST" | grep INSTANCE | awk '{print $2}'`
+for instance in $RUN_INSTANCES_OUTPUT; do
+  echo "Waiting for instance $instance to start"
+  while true; do
+    printf "."
+    HOSTNAME=`ec2-describe-instances $instance | grep running | awk '{print $4}'`
+    if [ ! -z $HOSTNAME ]; then
+      echo "started as $HOSTNAME"
+      break;
+    fi
+    sleep 1
+  done
+done
+
+echo "Appointing master"
+MASTER_EC2_HOST=`ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 == 0) print $4}'`
+MASTER_IP=`dig +short $MASTER_EC2_HOST`
+echo "Master is $MASTER_EC2_HOST. Please set up DNS so $MASTER_HOST points to $MASTER_IP."

+ 10 - 0
src/contrib/ec2/bin/login-hadoop-cluster

@@ -0,0 +1,10 @@
+#!/bin/sh
+# Login to the master node of a running Hadoop EC2 cluster.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+echo "Logging in to master $MASTER_HOST."
+ssh $SSH_OPTS "root@$MASTER_HOST"

+ 5 - 47
src/contrib/ec2/bin/run-hadoop-cluster

@@ -1,60 +1,18 @@
 #!/bin/sh
-# Launch an EC2 cluster of Hadoop instances and connect to the master.
+# Launch an EC2 cluster of Hadoop instances, start Hadoop, and connect to the master.
 
 # Import variables
 bin=`dirname "$0"`
 bin=`cd "$bin"; pwd`
-. "$bin"/hadoop-ec2-env.sh
 
-ec2-describe-group | grep $GROUP > /dev/null
-if [ ! $? -eq 0 ]; then
-  echo "Creating group $GROUP"
-  ec2-add-group $GROUP -d "Group for Hadoop clusters."
-  ec2-authorize $GROUP -p 22    # ssh
-  ec2-authorize $GROUP -p 50030 # JobTracker web interface
-  ec2-authorize $GROUP -p 50060 # TaskTracker web interface
-  ec2-authorize $GROUP -o $GROUP -u $AWS_ACCOUNT_ID 
+if ! "$bin"/launch-hadoop-cluster ; then
+  exit $?
 fi
 
-# Finding Hadoop image
-AMI_IMAGE=`ec2-describe-images -a | grep $S3_BUCKET | grep available | awk '{print $2}'`
-
-# Start a cluster
-echo "Starting cluster with AMI $AMI_IMAGE"
-RUN_INSTANCES_OUTPUT=`ec2-run-instances $AMI_IMAGE -n $NO_INSTANCES -g $GROUP -k gsg-keypair -d "$NO_INSTANCES,$MASTER_HOST" | grep INSTANCE | awk '{print $2}'`
-for instance in $RUN_INSTANCES_OUTPUT; do
-  echo "Waiting for instance $instance to start"
-  while true; do
-    printf "."
-    HOSTNAME=`ec2-describe-instances $instance | grep running | awk '{print $4}'`
-    if [ ! -z $HOSTNAME ]; then
-      echo "started as $HOSTNAME"
-      break;
-    fi
-    sleep 1
-  done
-done
-
-echo "Appointing master"
-MASTER_EC2_HOST=`ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 == 0) print $4}'`
-MASTER_IP=`dig +short $MASTER_EC2_HOST`
-echo "Master is $MASTER_EC2_HOST. Please set up DNS so $MASTER_HOST points to $MASTER_IP then press return to continue."
+echo "Press return to continue."
 read dummy
 
 echo "Waiting before trying to connect..."
 sleep 30
 
-echo "Creating slaves file and copying to master"
-ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 != 0) print $4}' > slaves
-scp $SSH_OPTS slaves "root@$MASTER_HOST:/usr/local/hadoop-$HADOOP_VERSION/conf/slaves"
-
-echo "Formatting new cluster's filesystem"
-ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop namenode -format"
-
-echo "Starting cluster"
-ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/start-all.sh"
-
-echo "Finished - check progress at http://$MASTER_HOST:50030/"
-
-echo "Logging in to master $MASTER_HOST."
-ssh $SSH_OPTS "root@$MASTER_HOST"
+"$bin"/start-hadoop

+ 39 - 0
src/contrib/ec2/bin/start-hadoop

@@ -0,0 +1,39 @@
+#!/bin/sh
+# Start Hadoop on a cluster.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+echo "Asking master to say hello"
+if ! ssh $SSH_OPTS "root@$MASTER_HOST" echo "hello" ; then
+  echo "SSH failed for root@$MASTER_HOST"
+  exit 1
+fi
+
+echo "Creating slaves file and copying to master"
+ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 != 0) print $4}' > slaves
+scp $SSH_OPTS slaves "root@$MASTER_HOST:/usr/local/hadoop-$HADOOP_VERSION/conf/slaves"
+
+echo "Copying private key to master"
+scp $SSH_OPTS $PRIVATE_KEY_PATH "root@$MASTER_HOST:/root/.ssh/id_rsa"
+ssh $SSH_OPTS "root@$MASTER_HOST" "chmod 600 /root/.ssh/id_rsa"
+
+echo "Copying private key to slaves"
+for slave in `cat slaves`; do
+  scp $SSH_OPTS $PRIVATE_KEY_PATH "root@$slave:/root/.ssh/id_rsa"
+  ssh $SSH_OPTS "root@$slave" "chmod 600 /root/.ssh/id_rsa"
+  sleep 1
+done
+
+echo "Formatting new cluster's filesystem"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop namenode -format"
+
+echo "Starting cluster"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/start-all.sh"
+
+echo "Finished - check progress at http://$MASTER_HOST:50030/"
+
+echo "Logging in to master $MASTER_HOST."
+ssh $SSH_OPTS "root@$MASTER_HOST"
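
A note on why the private key is copied to every node: start-all.sh on the master fans out over ssh to each host listed in conf/slaves, so those hops must be passwordless. Roughly, as a sketch of what happens under the hood:

  # For each host in conf/slaves, start-all.sh effectively runs something like:
  #   ssh $slave "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop-daemon.sh start datanode"
  # With /root/.ssh/id_rsa in place (mode 600) and the image's
  # 'StrictHostKeyChecking no' setting, these hops need no password or prompt.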