
HADOOP-884. Add scripts in contrib/ec2 to facilitate running Hadoop on Amazon's EC2 cluster. Contributed by Tom White.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@501249 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 years ago
commit 72aac6fd00

+ 3 - 0
CHANGES.txt

@@ -92,6 +92,9 @@ Trunk (unreleased changes)
 28. HADOOP-227.  Add support for backup namenodes, which periodically
     get snapshots of the namenode state.  (Dhruba Borthakur via cutting) 
 
+29. HADOOP-884.  Add scripts in contrib/ec2 to facilitate running
+    Hadoop on Amazon's EC2 cluster.  (Tom White via cutting)
+
 
 Release 0.10.1 - 2007-01-10
 

+ 51 - 0
src/contrib/ec2/bin/create-hadoop-image

@@ -0,0 +1,51 @@
+#!/bin/sh
+# Create a Hadoop AMI.
+# Inspired by Jonathan Siegel's EC2 script (http://blogsiegel.blogspot.com/2006/08/sandboxing-amazon-ec2.html)
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+# Use a Fedora Core 4 base image
+AMI_IMAGE=`ec2-describe-images -a | grep fedora-core4-base | awk '{print $2}'`
+
+echo "Starting a Fedora Core base AMI with ID $AMI_IMAGE."
+OUTPUT=`ec2-run-instances $AMI_IMAGE -k gsg-keypair`
+BOOTING_INSTANCE=`echo $OUTPUT | awk '{print $6}'`
+
+echo "Instance is $BOOTING_INSTANCE."
+
+echo "Polling server status (ec2-describe-instances $BOOTING_INSTANCE)"
+while true; do
+  printf "."
+  HOSTNAME=`ec2-describe-instances $BOOTING_INSTANCE | grep running | awk '{print $4}'`
+  if [ ! -z "$HOSTNAME" ]; then
+    break;
+  fi
+  sleep 1
+done
+
+echo "The server is available at $HOSTNAME."
+
+echo "Waiting before trying to connect..."
+sleep 30
+
+echo "Copying scripts."
+
+# Copy setup scripts
+scp $SSH_OPTS "$bin"/hadoop-ec2-env.sh "root@$HOSTNAME:"
+scp $SSH_OPTS "$bin"/image/hadoop-init "root@$HOSTNAME:/etc/init.d/hadoop-init"
+scp $SSH_OPTS "$bin"/image/create-hadoop-image-remote "root@$HOSTNAME:"
+
+# Copy private key and certificate (for bundling image)
+scp $SSH_OPTS $EC2_KEYDIR/pk-*.pem "root@$HOSTNAME:"
+scp $SSH_OPTS $EC2_KEYDIR/cert-*.pem "root@$HOSTNAME:"
+
+# Connect to it
+ssh $SSH_OPTS "root@$HOSTNAME" './create-hadoop-image-remote'
+
+# Register image
+ec2-register $S3_BUCKET/image.manifest.xml
+
+echo "Terminate with: ec2-terminate-instances $BOOTING_INSTANCE"

+ 40 - 0
src/contrib/ec2/bin/hadoop-ec2-env.sh

@@ -0,0 +1,40 @@
+# Set environment variables for running Hadoop on Amazon EC2 here. All are required.
+
+# Your Amazon Account Number.
+AWS_ACCOUNT_ID=
+
+# Your Amazon AWS access key.
+AWS_ACCESS_KEY_ID=
+
+# Your Amazon AWS secret access key.
+AWS_SECRET_ACCESS_KEY=
+
+# The Amazon S3 bucket where the Hadoop AMI you create will be stored.
+S3_BUCKET=
+
+# SSH options used when connecting to EC2 instances.
+# Change the -i option to be the absolute path to your keypair that you set up in the Amazon Getting Started guide.
+SSH_OPTS='-i /home/<yourname>/id_rsa-gsg-keypair -o StrictHostKeyChecking=no'
+
+# Location of EC2 keys.
+# The default setting is probably OK if you set up EC2 following the Amazon Getting Started guide.
+EC2_KEYDIR=~/.ec2
+
+# The download URL for the Sun JDK. Visit http://java.sun.com/javase/downloads/index_jdk5.jsp and get the URL for the "Linux self-extracting file".
+JAVA_BINARY_URL=''
+
+# The version number of the installed JDK.
+JAVA_VERSION=1.5.0_11
+
+# The EC2 group to run your cluster in.
+GROUP=hadoop-cluster-group
+
+# The version of Hadoop to install.
+HADOOP_VERSION=0.10.1
+
+# The hostname of the master node in the cluster. You need to be able to set the DNS for this host to point to the master's IP address.
+# See http://www.dyndns.com/services/dns/dyndns/, for example.
+MASTER_HOST=
+
+# The number of nodes in your cluster.
+NO_INSTANCES=2
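
For concreteness, a filled-in file might look like this. Every value below is a hypothetical placeholder, not a default:

    AWS_ACCOUNT_ID=123456789012
    AWS_ACCESS_KEY_ID=AKIAEXAMPLEKEY
    AWS_SECRET_ACCESS_KEY=exampleSecretKey
    S3_BUCKET=my-hadoop-images
    SSH_OPTS='-i /home/alice/id_rsa-gsg-keypair -o StrictHostKeyChecking=no'
    EC2_KEYDIR=~/.ec2
    JAVA_BINARY_URL='http://.../jdk-1_5_0_11-linux-i586.bin'  # from java.sun.com
    JAVA_VERSION=1.5.0_11
    GROUP=hadoop-cluster-group
    HADOOP_VERSION=0.10.1
    MASTER_HOST=hadoop-master.example.com
    NO_INSTANCES=2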

+ 47 - 0
src/contrib/ec2/bin/image/create-hadoop-image-remote

@@ -0,0 +1,47 @@
+#!/bin/sh
+# Create a Hadoop AMI. Runs on the EC2 instance.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+# Install Java
+cd /usr/local
+wget -nv -O java.bin $JAVA_BINARY_URL
+sh java.bin
+rm -f java.bin
+
+# Install tools
+yum -y install rsync
+
+# Install Hadoop
+cd /usr/local
+wget -nv http://www.apache.org/dist/lucene/hadoop/hadoop-$HADOOP_VERSION.tar.gz
+tar xzf hadoop-$HADOOP_VERSION.tar.gz
+rm -f hadoop-$HADOOP_VERSION.tar.gz
+
+# Configure Hadoop
+sed -i -e "s|# export JAVA_HOME=.*|export JAVA_HOME=/usr/local/jdk${JAVA_VERSION}|" \
+       -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/mnt/hadoop/logs|' \
+       -e 's|# export HADOOP_SLAVE_SLEEP=.*|export HADOOP_SLAVE_SLEEP=1|' \
+      /usr/local/hadoop-$HADOOP_VERSION/conf/hadoop-env.sh
+mkdir -p /mnt/hadoop/logs
+
+# Do Hadoop configuration for master hostname and cluster size on instance startup for runlevels 3 and 4.
+# Runlevel 4 is used by Xen. See http://developer.amazonwebservices.com/connect/message.jspa?messageID=45948#45948
+ln -s /etc/init.d/hadoop-init /etc/rc3.d/S99hadoop-init
+ln -s /etc/init.d/hadoop-init /etc/rc4.d/S99hadoop-init
+
+# Configure networking
+ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
+cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
+echo '    StrictHostKeyChecking no' >> /etc/ssh/ssh_config
+
+# Bundle and upload image
+cd ~root
+ec2-bundle-vol -d /mnt -k ~root/pk-*.pem -c ~root/cert-*.pem -u $AWS_ACCOUNT_ID -s 1536 -p hadoop-$HADOOP_VERSION
+ec2-upload-bundle -b $S3_BUCKET -m /mnt/hadoop-$HADOOP_VERSION.manifest.xml -a $AWS_ACCESS_KEY_ID -s $AWS_SECRET_ACCESS_KEY
+
+# End
+echo Done
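
After the sed edits above, the relevant lines of conf/hadoop-env.sh on the image read as follows (shown with JAVA_VERSION=1.5.0_11 from the env file):

    export JAVA_HOME=/usr/local/jdk1.5.0_11
    export HADOOP_LOG_DIR=/mnt/hadoop/logs
    export HADOOP_SLAVE_SLEEP=1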

+ 73 - 0
src/contrib/ec2/bin/image/hadoop-init

@@ -0,0 +1,73 @@
+#!/bin/sh
+
+start() {
+  USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
+  NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`
+  MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`
+  HADOOP_HOME=`ls -d /usr/local/hadoop-*`
+  echo $NO_INSTANCES, $MASTER_HOST, $HADOOP_HOME
+  
+  sed -i -e "s|# export HADOOP_MASTER=.*|export HADOOP_MASTER=$MASTER_HOST:$HADOOP_HOME|" \
+      $HADOOP_HOME/conf/hadoop-env.sh
+      
+  cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>hadoop.tmp.dir</name>
+  <value>/mnt/hadoop</value>
+</property>
+
+<property>
+  <name>fs.default.name</name>
+  <value>$MASTER_HOST:50001</value>
+</property>
+
+<property>
+  <name>mapred.job.tracker</name>
+  <value>$MASTER_HOST:50002</value>
+</property>
+
+</configuration>
+EOF
+
+  cat > $HADOOP_HOME/conf/mapred-default.xml <<EOF
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>mapred.map.tasks</name>
+  <value>$(( NO_INSTANCES * 10 ))</value>
+</property>
+
+<property>
+  <name>mapred.reduce.tasks</name>
+  <value>$(( NO_INSTANCES * 3 ))</value>
+</property>
+
+</configuration>
+EOF
+
+}
+
+case "$1" in
+  start)
+        start
+        ;;
+  stop)
+        ;;
+  status)
+        ;;
+  restart|reload|condrestart)
+        ;;
+  *)
+        echo $"Usage: $0 {start|stop|restart|reload|status}"
+        exit 1
+esac
+
+exit 0
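
The comma-separated user-data this init script parses is supplied by run-hadoop-cluster (below) via ec2-run-instances -d "$NO_INSTANCES,$MASTER_HOST". For example, with user-data of 2,hadoop-master.example.com (a hypothetical hostname), the script sets NO_INSTANCES=2 and MASTER_HOST=hadoop-master.example.com, so the generated mapred-default.xml gets mapred.map.tasks=20 and mapred.reduce.tasks=6. The parse can be checked by hand:

    USER_DATA='2,hadoop-master.example.com'
    python -c "print '$USER_DATA'.split(',')[0]"   # prints: 2
    python -c "print '$USER_DATA'.split(',')[1]"   # prints: hadoop-master.example.com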

+ 60 - 0
src/contrib/ec2/bin/run-hadoop-cluster

@@ -0,0 +1,60 @@
+#!/bin/sh
+# Launch an EC2 cluster of Hadoop instances and connect to the master.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+ec2-describe-group | grep $GROUP > /dev/null
+if [ $? -ne 0 ]; then
+  echo "Creating group $GROUP"
+  ec2-add-group $GROUP -d "Group for Hadoop clusters."
+  ec2-authorize $GROUP -p 22    # ssh
+  ec2-authorize $GROUP -p 50030 # JobTracker web interface
+  ec2-authorize $GROUP -p 50060 # TaskTracker web interface
+  ec2-authorize $GROUP -o $GROUP -u $AWS_ACCOUNT_ID 
+fi
+
+# Find the Hadoop image
+AMI_IMAGE=`ec2-describe-images -a | grep $S3_BUCKET | grep available | awk '{print $2}'`
+
+# Start a cluster
+echo "Starting cluster with AMI $AMI_IMAGE"
+RUN_INSTANCES_OUTPUT=`ec2-run-instances $AMI_IMAGE -n $NO_INSTANCES -g $GROUP -k gsg-keypair -d "$NO_INSTANCES,$MASTER_HOST" | grep INSTANCE | awk '{print $2}'`
+for instance in $RUN_INSTANCES_OUTPUT; do
+  echo "Waiting for instance $instance to start"
+  while true; do
+    printf "."
+    HOSTNAME=`ec2-describe-instances $instance | grep running | awk '{print $4}'`
+    if [ ! -z "$HOSTNAME" ]; then
+      echo "started as $HOSTNAME"
+      break;
+    fi
+    sleep 1
+  done
+done
+
+echo "Appointing master"
+MASTER_EC2_HOST=`ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 == 0) print $4}'`
+MASTER_IP=`dig +short $MASTER_EC2_HOST`
+echo "Master is $MASTER_EC2_HOST. Please set up DNS so $MASTER_HOST points to $MASTER_IP then press return to continue."
+read dummy
+
+echo "Waiting before trying to connect..."
+sleep 30
+
+echo "Creating slaves file and copying to master"
+ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 != 0) print $4}' > slaves
+scp $SSH_OPTS slaves "root@$MASTER_HOST:/usr/local/hadoop-$HADOOP_VERSION/conf/slaves"
+
+echo "Formatting new cluster's filesystem"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop namenode -format"
+
+echo "Starting cluster"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/start-all.sh"
+
+echo "Finished - check progress at http://$MASTER_HOST:50030/"
+
+echo "Logging in to master $MASTER_HOST."
+ssh $SSH_OPTS "root@$MASTER_HOST"

+ 9 - 0
src/contrib/ec2/bin/terminate-hadoop-cluster

@@ -0,0 +1,9 @@
+#!/bin/sh
+# Terminate a cluster.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+ec2-terminate-instances `ec2-describe-instances | grep INSTANCE | awk '{print $2}'`
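
Note that this terminates every instance the account is running, not just the Hadoop cluster. If other instances may be live, a more cautious variant (a sketch, assuming the contemporary EC2 tools' output format in which RESERVATION lines list the security groups) filters on $GROUP:

    ec2-terminate-instances `ec2-describe-instances | \
      awk -v group=$GROUP '/^RESERVATION/ { in_group = ($0 ~ group) } /^INSTANCE/ && in_group { print $2 }'`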