
HADOOP-884. Add scripts in contrib/ec2 to facilitate running Hadoop on Amazon's EC2 cluster. Contributed by Tom White.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@501249 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 years ago
commit 72aac6fd00

+ 3 - 0
CHANGES.txt

@@ -92,6 +92,9 @@ Trunk (unreleased changes)
 28. HADOOP-227.  Add support for backup namenodes, which periodically
     get snapshots of the namenode state.  (Dhruba Borthakur via cutting) 
 
+29. HADOOP-884.  Add scripts in contrib/ec2 to facilitate running
+    Hadoop on Amazon's EC2 cluster.  (Tom White via cutting)
+
 
 Release 0.10.1 - 2007-01-10
 

+ 51 - 0
src/contrib/ec2/bin/create-hadoop-image

@@ -0,0 +1,51 @@
+#!/bin/sh
+# Create a Hadoop AMI.
+# Inspired by Jonathan Siegel's EC2 script (http://blogsiegel.blogspot.com/2006/08/sandboxing-amazon-ec2.html)
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+# Use a Fedora Core 4 base image
+AMI_IMAGE=`ec2-describe-images -a | grep fedora-core4-base | awk '{print $2}'`
+
+echo "Starting a Fedora Core base AMI with ID $AMI_IMAGE."
+OUTPUT=`ec2-run-instances $AMI_IMAGE -k gsg-keypair`
+BOOTING_INSTANCE=`echo $OUTPUT | awk '{print $6}'`
+
+echo "Instance is $BOOTING_INSTANCE."
+
+echo "Polling server status (ec2-describe-instances $BOOTING_INSTANCE)"
+while true; do
+  printf "."
+  HOSTNAME=`ec2-describe-instances $BOOTING_INSTANCE | grep running | awk '{print $4}'`
+  if [ ! -z "$HOSTNAME" ]; then
+    break;
+  fi
+  sleep 1
+done
+
+echo "The server is available at $HOSTNAME."
+
+echo "Waiting before trying to connect..."
+sleep 30
+
+echo "Copying scripts."
+
+# Copy setup scripts
+scp $SSH_OPTS "$bin"/hadoop-ec2-env.sh "root@$HOSTNAME:"
+scp $SSH_OPTS "$bin"/image/hadoop-init "root@$HOSTNAME:/etc/init.d/hadoop-init"
+scp $SSH_OPTS "$bin"/image/create-hadoop-image-remote "root@$HOSTNAME:"
+
+# Copy private key and certificate (for bundling image)
+scp $SSH_OPTS $EC2_KEYDIR/pk-*.pem "root@$HOSTNAME:"
+scp $SSH_OPTS $EC2_KEYDIR/cert-*.pem "root@$HOSTNAME:"
+
+# Connect to it
+ssh $SSH_OPTS "root@$HOSTNAME" './create-hadoop-image-remote'
+
+# Register image
+ec2-register $S3_BUCKET/image.manifest.xml
+
+echo "Terminate with: ec2-terminate-instances $BOOTING_INSTANCE"

+ 40 - 0
src/contrib/ec2/bin/hadoop-ec2-env.sh

@@ -0,0 +1,40 @@
+# Set environment variables for running Hadoop on Amazon EC2 here. All are required.
+
+# Your Amazon Account Number.
+AWS_ACCOUNT_ID=
+
+# Your Amazon AWS access key.
+AWS_ACCESS_KEY_ID=
+
+# Your Amazon AWS secret access key.
+AWS_SECRET_ACCESS_KEY=
+
+# The Amazon S3 bucket where the Hadoop AMI you create will be stored.
+S3_BUCKET=
+
+# SSH options used when connecting to EC2 instances.
+# Change the -i option to be the absolute path to your keypair that you set up in the Amazon Getting Started guide.
+SSH_OPTS='-i /home/<yourname>/id_rsa-gsg-keypair -o StrictHostKeyChecking=no'
+
+# Location of EC2 keys.
+# The default setting is probably OK if you set up EC2 following the Amazon Getting Started guide.
+EC2_KEYDIR=~/.ec2
+
+# The download URL for the Sun JDK. Visit http://java.sun.com/javase/downloads/index_jdk5.jsp and get the URL for the "Linux self-extracting file".
+JAVA_BINARY_URL=''
+
+# The version number of the installed JDK.
+JAVA_VERSION=1.5.0_11
+
+# The EC2 group to run your cluster in.
+GROUP=hadoop-cluster-group
+
+# The version of Hadoop to install.
+HADOOP_VERSION=0.10.1
+
+# The hostname of the master node in the cluster. You need to be able to set the DNS for this host to point to the master's IP address.
+# See http://www.dyndns.com/services/dns/dyndns/, for example.
+MASTER_HOST=
+
+# The number of nodes in your cluster.
+NO_INSTANCES=2
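
For concreteness, a filled-in file might look like this. Every value below is a hypothetical placeholder, not a default:

    AWS_ACCOUNT_ID=123456789012
    AWS_ACCESS_KEY_ID=AKIAEXAMPLEKEY
    AWS_SECRET_ACCESS_KEY=exampleSecretKey
    S3_BUCKET=my-hadoop-images
    SSH_OPTS='-i /home/alice/id_rsa-gsg-keypair -o StrictHostKeyChecking=no'
    EC2_KEYDIR=~/.ec2
    JAVA_BINARY_URL='http://.../jdk-1_5_0_11-linux-i586.bin'  # from java.sun.com
    JAVA_VERSION=1.5.0_11
    GROUP=hadoop-cluster-group
    HADOOP_VERSION=0.10.1
    MASTER_HOST=hadoop-master.example.com
    NO_INSTANCES=2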

+ 47 - 0
src/contrib/ec2/bin/image/create-hadoop-image-remote

@@ -0,0 +1,47 @@
+#!/bin/sh
+# Create a Hadoop AMI. Runs on the EC2 instance.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+# Install Java
+cd /usr/local
+wget -nv -O java.bin $JAVA_BINARY_URL
+sh java.bin
+rm -f java.bin
+
+# Install tools
+yum -y install rsync
+
+# Install Hadoop
+cd /usr/local
+wget -nv http://www.apache.org/dist/lucene/hadoop/hadoop-$HADOOP_VERSION.tar.gz
+tar xzf hadoop-$HADOOP_VERSION.tar.gz
+rm -f hadoop-$HADOOP_VERSION.tar.gz
+
+# Configure Hadoop
+sed -i -e "s|# export JAVA_HOME=.*|export JAVA_HOME=/usr/local/jdk${JAVA_VERSION}|" \
+       -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/mnt/hadoop/logs|' \
+       -e 's|# export HADOOP_SLAVE_SLEEP=.*|export HADOOP_SLAVE_SLEEP=1|' \
+      /usr/local/hadoop-$HADOOP_VERSION/conf/hadoop-env.sh
+mkdir -p /mnt/hadoop/logs
+
+# Do Hadoop configuration for master hostname and cluster size on instance startup for runlevels 3 and 4.
+# Runlevel 4 is used by Xen. See http://developer.amazonwebservices.com/connect/message.jspa?messageID=45948#45948
+ln -s /etc/init.d/hadoop-init /etc/rc3.d/S99hadoop-init
+ln -s /etc/init.d/hadoop-init /etc/rc4.d/S99hadoop-init
+
+# Configure networking
+ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
+cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
+echo '    StrictHostKeyChecking no' >> /etc/ssh/ssh_config
+
+# Bundle and upload image
+cd ~root
+ec2-bundle-vol -d /mnt -k ~root/pk-*.pem -c ~root/cert-*.pem -u $AWS_ACCOUNT_ID -s 1536 -p hadoop-$HADOOP_VERSION
+ec2-upload-bundle -b $S3_BUCKET -m /mnt/hadoop-$HADOOP_VERSION.manifest.xml -a $AWS_ACCESS_KEY_ID -s $AWS_SECRET_ACCESS_KEY
+
+# End
+echo Done
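
After the sed edits above, the relevant lines of conf/hadoop-env.sh on the image read as follows (shown with JAVA_VERSION=1.5.0_11 from the env file):

    export JAVA_HOME=/usr/local/jdk1.5.0_11
    export HADOOP_LOG_DIR=/mnt/hadoop/logs
    export HADOOP_SLAVE_SLEEP=1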

+ 73 - 0
src/contrib/ec2/bin/image/hadoop-init

@@ -0,0 +1,73 @@
+#!/bin/sh
+
+start() {
+  USER_DATA=`wget -q -O - http://169.254.169.254/1.0/user-data`
+  NO_INSTANCES=`python -c "print '$USER_DATA'.split(',')[0]"`
+  MASTER_HOST=`python -c "print '$USER_DATA'.split(',')[1]"`
+  HADOOP_HOME=`ls -d /usr/local/hadoop-*`
+  echo $NO_INSTANCES, $MASTER_HOST, $HADOOP_HOME
+  
+  sed -i -e "s|# export HADOOP_MASTER=.*|export HADOOP_MASTER=$MASTER_HOST:$HADOOP_HOME|" \
+      $HADOOP_HOME/conf/hadoop-env.sh
+      
+  cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>hadoop.tmp.dir</name>
+  <value>/mnt/hadoop</value>
+</property>
+
+<property>
+  <name>fs.default.name</name>
+  <value>$MASTER_HOST:50001</value>
+</property>
+
+<property>
+  <name>mapred.job.tracker</name>
+  <value>$MASTER_HOST:50002</value>
+</property>
+
+</configuration>
+EOF
+
+  cat > $HADOOP_HOME/conf/mapred-default.xml <<EOF
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>mapred.map.tasks</name>
+  <value>$(( NO_INSTANCES * 10 ))</value>
+</property>
+
+<property>
+  <name>mapred.reduce.tasks</name>
+  <value>$(( NO_INSTANCES * 3 ))</value>
+</property>
+
+</configuration>
+EOF
+
+}
+
+case "$1" in
+  start)
+        start
+        ;;
+  stop)
+        ;;
+  status)
+        ;;
+  restart|reload|condrestart)
+        ;;
+  *)
+        echo $"Usage: $0 {start|stop|restart|reload|status}"
+        exit 1
+esac
+
+exit 0
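
The comma-separated user-data this init script parses is supplied by run-hadoop-cluster (below) via ec2-run-instances -d "$NO_INSTANCES,$MASTER_HOST". For example, with user-data of 2,hadoop-master.example.com (a hypothetical hostname), the script sets NO_INSTANCES=2 and MASTER_HOST=hadoop-master.example.com, so the generated mapred-default.xml gets mapred.map.tasks=20 and mapred.reduce.tasks=6. The parse can be checked by hand:

    USER_DATA='2,hadoop-master.example.com'
    python -c "print '$USER_DATA'.split(',')[0]"   # prints: 2
    python -c "print '$USER_DATA'.split(',')[1]"   # prints: hadoop-master.example.com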

+ 60 - 0
src/contrib/ec2/bin/run-hadoop-cluster

@@ -0,0 +1,60 @@
+#!/bin/sh
+# Launch an EC2 cluster of Hadoop instances and connect to the master.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+ec2-describe-group | grep $GROUP > /dev/null
+if [ $? -ne 0 ]; then
+  echo "Creating group $GROUP"
+  ec2-add-group $GROUP -d "Group for Hadoop clusters."
+  ec2-authorize $GROUP -p 22    # ssh
+  ec2-authorize $GROUP -p 50030 # JobTracker web interface
+  ec2-authorize $GROUP -p 50060 # TaskTracker web interface
+  ec2-authorize $GROUP -o $GROUP -u $AWS_ACCOUNT_ID 
+fi
+
+# Find the Hadoop image
+AMI_IMAGE=`ec2-describe-images -a | grep $S3_BUCKET | grep available | awk '{print $2}'`
+
+# Start a cluster
+echo "Starting cluster with AMI $AMI_IMAGE"
+RUN_INSTANCES_OUTPUT=`ec2-run-instances $AMI_IMAGE -n $NO_INSTANCES -g $GROUP -k gsg-keypair -d "$NO_INSTANCES,$MASTER_HOST" | grep INSTANCE | awk '{print $2}'`
+for instance in $RUN_INSTANCES_OUTPUT; do
+  echo "Waiting for instance $instance to start"
+  while true; do
+    printf "."
+    HOSTNAME=`ec2-describe-instances $instance | grep running | awk '{print $4}'`
+    if [ ! -z "$HOSTNAME" ]; then
+      echo "started as $HOSTNAME"
+      break;
+    fi
+    sleep 1
+  done
+done
+
+echo "Appointing master"
+MASTER_EC2_HOST=`ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 == 0) print $4}'`
+MASTER_IP=`dig +short $MASTER_EC2_HOST`
+echo "Master is $MASTER_EC2_HOST. Please set up DNS so $MASTER_HOST points to $MASTER_IP then press return to continue."
+read dummy
+
+echo "Waiting before trying to connect..."
+sleep 30
+
+echo "Creating slaves file and copying to master"
+ec2-describe-instances | grep INSTANCE | grep running | awk '{if ($7 != 0) print $4}' > slaves
+scp $SSH_OPTS slaves "root@$MASTER_HOST:/usr/local/hadoop-$HADOOP_VERSION/conf/slaves"
+
+echo "Formatting new cluster's filesystem"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/hadoop namenode -format"
+
+echo "Starting cluster"
+ssh $SSH_OPTS "root@$MASTER_HOST" "/usr/local/hadoop-$HADOOP_VERSION/bin/start-all.sh"
+
+echo "Finished - check progress at http://$MASTER_HOST:50030/"
+
+echo "Logging in to master $MASTER_HOST."
+ssh $SSH_OPTS "root@$MASTER_HOST"

+ 9 - 0
src/contrib/ec2/bin/terminate-hadoop-cluster

@@ -0,0 +1,9 @@
+#!/bin/sh
+# Terminate a cluster.
+
+# Import variables
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+. "$bin"/hadoop-ec2-env.sh
+
+ec2-terminate-instances `ec2-describe-instances | grep INSTANCE | awk '{print $2}'`
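
Note that this terminates every instance the account is running, not just the Hadoop cluster. If other instances may be live, a more cautious variant (a sketch, assuming the contemporary EC2 tools' output format in which RESERVATION lines list the security groups) filters on $GROUP:

    ec2-terminate-instances `ec2-describe-instances | \
      awk -v group=$GROUP '/^RESERVATION/ { in_group = ($0 ~ group) } /^INSTANCE/ && in_group { print $2 }'`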