
HADOOP-952. Create a public (shared) Hadoop EC2 AMI.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@510224 13f79535-47bb-0310-9956-ffa450edef68
Thomas White, 18 years ago
commit 8f165b5fd7

+ 5 - 1
CHANGES.txt

@@ -86,9 +86,13 @@ Trunk (unreleased changes)
     directly, so that user code is no longer required in the
     JobTracker.  (omalley via cutting)

-26. HAOOP-1006.  Remove obsolete '-local' option from test code.
+26. HADOOP-1006.  Remove obsolete '-local' option from test code.
     (Gautam Kowshik via cutting)

+27. HADOOP-952. Create a public (shared) Hadoop EC2 AMI.
+    The EC2 scripts now support launch of public AMIs.
+    (tomwhite)
+    

 Release 0.11.2 - 2007-02-16

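
Taken together, the changes below split the old all-in-one run-hadoop-cluster into a small suite of scripts fronted by bin/hadoop-ec2. A typical session against the public AMI might look like this (a sketch; it assumes you have copied hadoop-ec2-env.sh.template to hadoop-ec2-env.sh and filled in your AWS credentials and a MASTER_HOST whose DNS you control):

  # launch instances, start the Hadoop daemons, and log in to the master
  bin/hadoop-ec2 run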

+ 4 - 0
src/contrib/ec2/README.txt

@@ -3,6 +3,10 @@ Hadoop EC2
 This collection of scripts allows you to run Hadoop clusters on Amazon.com's Elastic Compute Cloud (EC2) service described at:

   http://aws.amazon.com/ec2
+  
+To get help, type the following in a shell:
+  
+  bin/hadoop-ec2

 For full instructions, please visit the Hadoop wiki at:


+ 4 - 4
src/contrib/ec2/bin/create-hadoop-image

@@ -35,17 +35,17 @@ echo "Copying scripts."

 # Copy setup scripts
 scp $SSH_OPTS "$bin"/hadoop-ec2-env.sh "root@$HOSTNAME:"
-scp $SSH_OPTS "$bin"/image/hadoop-init "root@$HOSTNAME:/etc/init.d/hadoop-init"
+scp $SSH_OPTS "$bin"/image/hadoop-init "root@$HOSTNAME:"
 scp $SSH_OPTS "$bin"/image/create-hadoop-image-remote "root@$HOSTNAME:"

 # Copy private key and certificate (for bundling image)
-scp $SSH_OPTS $EC2_KEYDIR/pk-*.pem "root@$HOSTNAME:"
-scp $SSH_OPTS $EC2_KEYDIR/cert-*.pem "root@$HOSTNAME:"
+scp $SSH_OPTS $EC2_KEYDIR/pk-*.pem "root@$HOSTNAME:/mnt"
+scp $SSH_OPTS $EC2_KEYDIR/cert-*.pem "root@$HOSTNAME:/mnt"

 # Connect to it
 ssh $SSH_OPTS "root@$HOSTNAME" './create-hadoop-image-remote'

 # Register image
-ec2-register $S3_BUCKET/image.manifest.xml
+ec2-register $S3_BUCKET/hadoop-$HADOOP_VERSION.manifest.xml

 echo "Terminate with: ec2-terminate-instances $BOOTING_INSTANCE"
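
With the defaults introduced in the template below (S3_BUCKET=hadoop-ec2-images, HADOOP_VERSION=0.11.2), the registration line now expands to:

  ec2-register hadoop-ec2-images/hadoop-0.11.2.manifest.xml

which matches the manifest name that ec2-bundle-vol produces from its -p hadoop-$HADOOP_VERSION prefix in create-hadoop-image-remote.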

+ 36 - 0
src/contrib/ec2/bin/hadoop-ec2

@@ -0,0 +1,36 @@
+#!/bin/sh
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+  echo "Usage: hadoop-ec2 COMMAND"
+  echo "where COMMAND is one of:"
+  echo "  create-image         create a Hadoop AMI"
+  echo "  launch-cluster       launch a cluster of Hadoop EC2 instances"
+  echo "  start-hadoop         start Hadoop daemons on a cluster"
+  echo "  login                login to the master node of the Hadoop EC2 cluster"
+  echo "  run                  'launch-cluster', 'start-hadoop', 'login'"
+  echo "  terminate-cluster    terminate a cluster of Hadoop EC2 instances"
+  exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+if [ "$COMMAND" = "create-image" ] ; then
+  . "$bin"/create-hadoop-image
+elif [ "$COMMAND" = "launch-cluster" ] ; then
+  . "$bin"/launch-hadoop-cluster
+elif [ "$COMMAND" = "start-hadoop" ] ; then
+  . "$bin"/start-hadoop
+elif [ "$COMMAND" = "run" ] ; then
+  . "$bin"/run-hadoop-cluster
+elif [ "$COMMAND" = "login" ] ; then
+  . "$bin"/login-hadoop-cluster
+elif [ "$COMMAND" = "terminate-cluster" ] ; then
+  . "$bin"/terminate-hadoop-cluster
+fi
+  
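
The wrapper sources the matching script, so each command behaves exactly as if the underlying script had been run directly. For example (assuming hadoop-ec2-env.sh has been filled in):

  bin/hadoop-ec2 launch-cluster    # equivalent to bin/launch-hadoop-cluster
  bin/hadoop-ec2                   # no arguments: prints the usage summary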

+ 29 - 15
src/contrib/ec2/bin/hadoop-ec2-env.sh.template

@@ -9,32 +9,46 @@ AWS_ACCESS_KEY_ID=
 # Your Amazon AWS secret access key.
 AWS_SECRET_ACCESS_KEY=

-# The Amazon S3 bucket where the Hadoop AMI you create will be stored.
-S3_BUCKET=
-
-# SSH options used when connecting to EC2 instances.
-# Change the -i option to be the absolute path to your keypair that you set up in the Amazon Getting Started guide.
-SSH_OPTS='-i /home/<yourname>/id_rsa-gsg-keypair -o StrictHostKeyChecking=no'
-
 # Location of EC2 keys.
 # The default setting is probably OK if you set up EC2 following the Amazon Getting Started guide.
-EC2_KEYDIR=~/.ec2
+EC2_KEYDIR=`dirname "$EC2_PRIVATE_KEY"`

-# The download URL for the Sun JDK. Visit http://java.sun.com/javase/downloads/index_jdk5.jsp and get the URL for the "Linux self-extracting file".
-JAVA_BINARY_URL=''
+# The EC2 key name used to launch instances.
+# The default is the value used in the Amazon Getting Started guide.
+KEY_NAME=gsg-keypair

-# The version number of the installed JDK.
-JAVA_VERSION=1.5.0_11
+# Where your EC2 private key is stored (created when following the Amazon Getting Started guide).
+# You need to change this if you don't store this with your other EC2 keys.
+PRIVATE_KEY_PATH=`echo "$EC2_KEYDIR"/"id_rsa-$KEY_NAME"`
+
+# SSH options used when connecting to EC2 instances.
+SSH_OPTS=`echo -i "$PRIVATE_KEY_PATH" -o StrictHostKeyChecking=no`
+
+# The version of Hadoop to use.
+HADOOP_VERSION=0.11.2
+
+# The Amazon S3 bucket where the Hadoop AMI is stored.
+# The default value is for public images, so it can be left as-is if you are running a public image.
+# Change this value only if you are creating your own (private) AMI
+# so you can store it in a bucket you own.
+S3_BUCKET=hadoop-ec2-images

 # The EC2 group to run your cluster in.
 GROUP=hadoop-cluster-group

-# The version of Hadoop to install.
-HADOOP_VERSION=0.10.1
-
 # The hostname of the master node in the cluster. You need to be able to set the DNS for this host to point to the master's IP address.
 # See http://www.dyndns.com/services/dns/dyndns/, for example.
 MASTER_HOST=

 # The number of nodes in your cluster.
 NO_INSTANCES=2
+
+#
+# The following variables are only used when creating an AMI.
+#
+
+# The download URL for the Sun JDK. Visit http://java.sun.com/javase/downloads/index_jdk5.jsp and get the URL for the "Linux self-extracting file".
+JAVA_BINARY_URL=''
+
+# The version number of the installed JDK.
+JAVA_VERSION=1.5.0_11
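
For reference, a minimally filled-in hadoop-ec2-env.sh for launching the public AMI might look like the following sketch; the credentials and hostname are placeholders, and the remaining values are the template defaults:

  AWS_ACCOUNT_ID=<your AWS account id>
  AWS_ACCESS_KEY_ID=<your access key id>
  AWS_SECRET_ACCESS_KEY=<your secret access key>
  KEY_NAME=gsg-keypair
  HADOOP_VERSION=0.11.2
  S3_BUCKET=hadoop-ec2-images
  GROUP=hadoop-cluster-group
  MASTER_HOST=hadoop-master.example.com   # hypothetical; must be a name whose DNS you control
  NO_INSTANCES=2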

+ 12 - 8
src/contrib/ec2/bin/image/create-hadoop-image-remote

@@ -6,6 +6,9 @@ bin=`dirname "$0"`
 bin=`cd "$bin"; pwd`
 . "$bin"/hadoop-ec2-env.sh

+# Remove environment script since it contains sensitive information
+rm -f "$bin"/hadoop-ec2-env.sh
+
 # Install Java
 cd /usr/local
 wget -nv -O java.bin $JAVA_BINARY_URL
@@ -28,19 +31,20 @@ sed -i -e "s|# export JAVA_HOME=.*|export JAVA_HOME=/usr/local/jdk${JAVA_VERSION
       /usr/local/hadoop-$HADOOP_VERSION/conf/hadoop-env.sh
 mkdir -p /mnt/hadoop/logs

-# Do Hadoop configuration for master hostname and cluster size on instance startup for runlevels 3 and 4.
-# Runlevel 4 is used by Xen. See http://developer.amazonwebservices.com/connect/message.jspa?messageID=45948#45948
-ln -s /etc/init.d/hadoop-init /etc/rc3.d/S99hadoop-init
-ln -s /etc/init.d/hadoop-init /etc/rc4.d/S99hadoop-init
+# Do configuration on instance startup
+echo "/root/hadoop-init" >> /etc/rc.d/rc.local
-# Configure networking
-ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
-cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
+# Configure networking.
+# Delete SSH authorized_keys since it includes the key it was launched with. (Note that it is re-populated when an instance starts.)
+rm -f /root/.ssh/authorized_keys
+# Ensure logging in to new hosts is seamless.
 echo '    StrictHostKeyChecking no' >> /etc/ssh/ssh_config

 # Bundle and upload image
 cd ~root
-ec2-bundle-vol -d /mnt -k ~root/pk-*.pem -c ~root/cert-*.pem -u $AWS_ACCOUNT_ID -s 1536 -p hadoop-$HADOOP_VERSION
+# Don't need to delete .bash_history since it isn't written until exit.
+ec2-bundle-vol -d /mnt -k /mnt/pk-*.pem -c /mnt/cert-*.pem -u $AWS_ACCOUNT_ID -s 1536 -p hadoop-$HADOOP_VERSION
+rm /mnt/pk-*.pem /mnt/cert-*.pem
 ec2-upload-bundle -b $S3_BUCKET -m /mnt/hadoop-$HADOOP_VERSION.manifest.xml -a $AWS_ACCESS_KEY_ID -s $AWS_SECRET_ACCESS_KEY

 # End
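
Note where the credentials live: ec2-bundle-vol excludes its destination directory (-d /mnt) from the bundled volume, so keeping pk-*.pem and cert-*.pem under /mnt should keep them out of the public image; the rm afterwards removes them from the instance as well. With the template defaults, the bundle and upload steps expand to roughly the following (account id and keys elided):

  ec2-bundle-vol -d /mnt -k /mnt/pk-*.pem -c /mnt/cert-*.pem -u <account id> -s 1536 -p hadoop-0.11.2
  ec2-upload-bundle -b hadoop-ec2-images -m /mnt/hadoop-0.11.2.manifest.xml -a <access key> -s <secret key>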

+ 11 - 32
src/contrib/ec2/bin/image/hadoop-init

@@ -1,16 +1,14 @@
-#!/bin/sh
-
-start() {
-  USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
-  NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`
-  MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`
-  HADOOP_HOME=`ls -d /usr/local/hadoop-*`
-  echo $NO_INSTANCES, $MASTER_HOST, $HADOOP_HOME
-  
-  sed -i -e "s|# export HADOOP_MASTER=.*|export HADOOP_MASTER=$MASTER_HOST:$HADOOP_HOME|" \
-      $HADOOP_HOME/conf/hadoop-env.sh
+# Use parameters passed in during launch to configure Hadoop
+USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
+NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`
+MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`
+HADOOP_HOME=`ls -d /usr/local/hadoop-*`
+echo $NO_INSTANCES, $MASTER_HOST, $HADOOP_HOME
+ 
+sed -i -e "s|# export HADOOP_MASTER=.*|export HADOOP_MASTER=$MASTER_HOST:$HADOOP_HOME|" \
+    $HADOOP_HOME/conf/hadoop-env.sh
-  cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
+cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

@@ -34,7 +32,7 @@ start() {
 </configuration>
 EOF

-  cat > $HADOOP_HOME/conf/mapred-default.xml <<EOF
+cat > $HADOOP_HOME/conf/mapred-default.xml <<EOF
 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

@@ -52,22 +50,3 @@ EOF

 </configuration>
 EOF
-
-}
-
-case "$1" in
-  start)
-        start
-        ;;
-  stop)
-        ;;
-  status)
-        ;;
-  restart|reload|condrestart)
-        ;;
-  *)
-        echo $"Usage: $0 {start|stop|restart|reload|status}"
-        exit 1
-esac
-
-exit 0
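
For reference, the user-data read at the top of this script is the comma-separated pair that launch-hadoop-cluster passes via ec2-run-instances -d "$NO_INSTANCES,$MASTER_HOST", so the two python one-liners split it like this (values hypothetical):

  USER_DATA='2,hadoop-master.example.com'
  NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`  # 2
  MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`   # hadoop-master.example.com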

+ 41 - 0
src/contrib/ec2/bin/launch-hadoop-cluster

@@ -0,0 +1,41 @@
+#!/bin/sh
+# Launch an EC2 cluster of Hadoop instances.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+ec2-describe-group | grep $GROUP > /dev/null
+if [ ! $? -eq 0 ]; then
+  echo "Creating group $GROUP"
+  ec2-add-group $GROUP -d "Group for Hadoop clusters."
+  ec2-authorize $GROUP -p 22    # ssh
+  ec2-authorize $GROUP -p 50030 # JobTracker web interface
+  ec2-authorize $GROUP -p 50060 # TaskTracker web interface
+  ec2-authorize $GROUP -o $GROUP -u $AWS_ACCOUNT_ID 
+fi
+
+# Finding Hadoop image
+AMI_IMAGE=`ec2-describe-images -a | grep $S3_BUCKET | grep $HADOOP_VERSION | grep available | awk '{print $2}'`
+
+# Start a cluster
+echo "Starting cluster with AMI $AMI_IMAGE"
+RUN_INSTANCES_OUTPUT=`ec2-run-instances $AMI_IMAGE -n $NO_INSTANCES -g $GROUP -k gsg-keypair -d "$NO_INSTANCES,$MASTER_HOST" | grep INSTANCE | awk '{print $2}'`
+for instance in $RUN_INSTANCES_OUTPUT; do
+  echo "Waiting for instance $instance to start"
+  while true; do
+    printf "."
+    HOSTNAME=`ec2-describe-instances $instance | grep running | awk '{print $4}'`
+    if [ ! -z $HOSTNAME ]; then
+      echo "started as $HOSTNAME"
+      break;
+    fi
+    sleep 1
+  done
+done
+
+echo "Appointing master"
+MASTER_EC2_HOST=`ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 == 0) print $4}'`
+MASTER_IP=`dig +short $MASTER_EC2_HOST`
+echo "Master is $MASTER_EC2_HOST. Please set up DNS so $MASTER_HOST points to $MASTER_IP."
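
launch-cluster stops short of starting the daemons: DNS for MASTER_HOST has to be pointed at the reported master IP first. The expected sequence, with a hypothetical MASTER_HOST, is:

  bin/hadoop-ec2 launch-cluster
  # set up DNS so hadoop-master.example.com resolves to the printed master IP,
  # e.g. via a dynamic DNS provider, then:
  bin/hadoop-ec2 start-hadoop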

+ 10 - 0
src/contrib/ec2/bin/login-hadoop-cluster

@@ -0,0 +1,10 @@
+#!/bin/sh
+# Login to the master node of a running Hadoop EC2 cluster.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+echo "Logging in to master $MASTER_HOST."
+ssh $SSH_OPTS "root@$MASTER_HOST"

+ 5 - 47
src/contrib/ec2/bin/run-hadoop-cluster

@@ -1,60 +1,18 @@
 #!/bin/sh
-# Launch an EC2 cluster of Hadoop instances and connect to the master.
+# Launch an EC2 cluster of Hadoop instances, start Hadoop, and connect to the master.

 # Import variables
 bin=`dirname "$0"`
 bin=`cd "$bin"; pwd`
-. "$bin"/hadoop-ec2-env.sh
-ec2-describe-group | grep $GROUP > /dev/null
-if [ ! $? -eq 0 ]; then
-  echo "Creating group $GROUP"
-  ec2-add-group $GROUP -d "Group for Hadoop clusters."
-  ec2-authorize $GROUP -p 22    # ssh
-  ec2-authorize $GROUP -p 50030 # JobTracker web interface
-  ec2-authorize $GROUP -p 50060 # TaskTracker web interface
-  ec2-authorize $GROUP -o $GROUP -u $AWS_ACCOUNT_ID 
+if ! "$bin"/launch-hadoop-cluster ; then
+  exit $?
 fi

-# Finding Hadoop image
-AMI_IMAGE=`ec2-describe-images -a | grep $S3_BUCKET | grep available | awk '{print $2}'`
-
-# Start a cluster
-echo "Starting cluster with AMI $AMI_IMAGE"
-RUN_INSTANCES_OUTPUT=`ec2-run-instances $AMI_IMAGE -n $NO_INSTANCES -g $GROUP -k gsg-keypair -d "$NO_INSTANCES,$MASTER_HOST" | grep INSTANCE | awk '{print $2}'`
-for instance in $RUN_INSTANCES_OUTPUT; do
-  echo "Waiting for instance $instance to start"
-  while true; do
-    printf "."
-    HOSTNAME=`ec2-describe-instances $instance | grep running | awk '{print $4}'`
-    if [ ! -z $HOSTNAME ]; then
-      echo "started as $HOSTNAME"
-      break;
-    fi
-    sleep 1
-  done
-done
-
-echo "Appointing master"
-MASTER_EC2_HOST=`ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 == 0) print $4}'`
-MASTER_IP=`dig +short $MASTER_EC2_HOST`
-echo "Master is $MASTER_EC2_HOST. Please set up DNS so $MASTER_HOST points to $MASTER_IP then press return to continue."
+echo "Press return to continue."
 read dummy

 echo "Waiting before trying to connect..."
 sleep 30

-echo "Creating slaves file and copying to master"
-ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 != 0) print $4}' > slaves
-scp $SSH_OPTS slaves "root@$MASTER_HOST:/usr/local/hadoop-$HADOOP_VERSION/conf/slaves"
-
-echo "Formatting new cluster's filesystem"
-ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop namenode -format"
-
-echo "Starting cluster"
-ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/start-all.sh"
-
-echo "Finished - check progress at http://$MASTER_HOST:50030/"
-
-echo "Logging in to master $MASTER_HOST."
-ssh $SSH_OPTS "root@$MASTER_HOST"
+"$bin"/start-hadoop

+ 39 - 0
src/contrib/ec2/bin/start-hadoop

@@ -0,0 +1,39 @@
+#!/bin/sh
+# Start Hadoop on a cluster.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+echo "Asking master to say hello"
+if ! ssh $SSH_OPTS "root@$MASTER_HOST" echo "hello" ; then
+  echo "SSH failed for root@$MASTER_HOST"
+  exit 1
+fi
+
+echo "Creating slaves file and copying to master"
+ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 != 0) print $4}' > slaves
+scp $SSH_OPTS slaves "root@$MASTER_HOST:/usr/local/hadoop-$HADOOP_VERSION/conf/slaves"
+
+echo "Copying private key to master"
+scp $SSH_OPTS $PRIVATE_KEY_PATH "root@$MASTER_HOST:/root/.ssh/id_rsa"
+ssh $SSH_OPTS "root@$MASTER_HOST" "chmod 600 /root/.ssh/id_rsa"
+
+echo "Copying private key to slaves"
+for slave in `cat slaves`; do
+  scp $SSH_OPTS $PRIVATE_KEY_PATH "root@$slave:/root/.ssh/id_rsa"
+  ssh $SSH_OPTS "root@$slave" "chmod 600 /root/.ssh/id_rsa"
+  sleep 1
+done
+
+echo "Formatting new cluster's filesystem"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop namenode -format"
+
+echo "Starting cluster"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/start-all.sh"
+
+echo "Finished - check progress at http://$MASTER_HOST:50030/"
+
+echo "Logging in to master $MASTER_HOST."
+ssh $SSH_OPTS "root@$MASTER_HOST"
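
Distributing the private key to the master and every slave is what lets the master's start-all.sh reach the slaves over ssh without prompting (host-key checking was already relaxed when the image was built). Once MASTER_HOST resolves, the whole sequence above is a single wrapper command; a sketch with a hypothetical hostname:

  bin/hadoop-ec2 start-hadoop
  # Finished - check progress at http://hadoop-master.example.com:50030/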