
HADOOP-2888. Make gridmix scripts more readily configurable and amenable
to automated execution. Contributed by Mukund Madhugiri




git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@636592 13f79535-47bb-0310-9956-ffa450edef68

Christopher Douglas, 17 years ago · commit 5c974a24f6
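With the defaults below guarded by environment checks, a harness can now drive a full gridmix run without editing any of the scripts. A minimal sketch of an unattended run (the install path and values are placeholders, not part of this commit):

    # hypothetical unattended run; /opt/hadoop is a placeholder install path
    export HADOOP_HOME=/opt/hadoop
    cd src/test/gridmix
    ./generateData.sh                        # generates the input data set (small by default)
    ./submissionScripts/allToSameCluster     # now blocks until all submission scripts return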

CHANGES.txt  +3 -0

@@ -79,6 +79,9 @@ Trunk (unreleased changes)
 
     HADOOP-2765. Enables specifying ulimits for streaming/pipes tasks (ddas)
 
+    HADOOP-2888. Make gridmix scripts more readily configurable and amenable
+    to automated execution. (Mukund Madhugiri via cdouglas)
+
   OPTIMIZATIONS
 
     HADOOP-2790.  Fixed inefficient method hasSpeculativeTask by removing

src/test/gridmix/generateData.sh  +17 -8

@@ -4,19 +4,28 @@ GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/gridmix-env
 
-# 2TB data compressing to approx 500GB
-#COMPRESSED_DATA_BYTES=2147483648000
+# Smaller data set is used by default.
 COMPRESSED_DATA_BYTES=2147483648
-# 500GB
-#UNCOMPRESSED_DATA_BYTES=536870912000
 UNCOMPRESSED_DATA_BYTES=536870912
-# Number of partitions for output data
-NUM_MAPS=100
-# Default approx 70MB per data file, compressed
-#INDIRECT_DATA_BYTES=58720256000
 INDIRECT_DATA_BYTES=58720256
+
+# Number of partitions for output data
+if [ -z ${NUM_MAPS} ] ; then
+  NUM_MAPS=100
+fi
 INDIRECT_DATA_FILES=200
 
+# If the env var USE_REAL_DATASET is set, then use the params to generate the bigger (real) dataset.
+if [ ! -z ${USE_REAL_DATASET} ] ; then
+  echo "Using real dataset"
+  # 2TB data compressing to approx 500GB
+  COMPRESSED_DATA_BYTES=2147483648000
+  # 500GB
+  UNCOMPRESSED_DATA_BYTES=536870912000
+  # Default approx 70MB per data file, compressed
+  INDIRECT_DATA_BYTES=58720256000 
+fi
+
 ${HADOOP_HOME}/bin/hadoop jar \
   ${EXAMPLE_JAR} randomtextwriter \
   -D test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES} \
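The data-set size and partition count are now selected from the environment as well; any non-empty USE_REAL_DATASET enables the 2TB/500GB parameters. A sketch of both modes (values are illustrative only):

    # default: small data set, 100 output partitions
    ./generateData.sh
    # full-size run: any non-empty value of USE_REAL_DATASET switches to the large parameters
    USE_REAL_DATASET=true NUM_MAPS=500 ./generateData.sh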

src/test/gridmix/gridmix-env  +28 -7

@@ -3,21 +3,42 @@
 
 ## Environment configuration
 # Hadoop installation
-export HADOOP_HOME=
+# set var only if it has not already been set externally
+if [ -z "${HADOOP_HOME}" ] ; then
+  export HADOOP_HOME=
+fi
 # Base directory for gridmix install
-export GRID_MIX_HOME=${GRID_DIR}
+# set var only if it has not already been set externally
+if [ -z "${GRID_MIX_HOME}" ] ; then
+  export GRID_MIX_HOME=${GRID_DIR}
+fi
 # Hadoop example jar
-export EXAMPLE_JAR=${HADOOP_HOME}/hadoop-0.15.2-dev-examples.jar
+# set var only if it has not already been set externally
+if [ -z "${EXAMPLE_JAR}" ] ; then
+  export EXAMPLE_JAR="${HADOOP_HOME}/hadoop-*-examples.jar"
+fi
 # Hadoop test jar
-export APP_JAR=${HADOOP_HOME}/hadoop-0.15.2-dev-test.jar
+# set var only if it has not already been set externally
+if [ -z "${APP_JAR}" ] ; then
+  export APP_JAR="${HADOOP_HOME}/hadoop-*-test.jar"
+fi
 # Hadoop streaming jar
-export STREAM_JAR=${HADOOP_HOME}/contrib/hadoop-0.15.2-streaming.jar
+# set var only if it has not already been set externally
+if [ -z "${STREAM_JAR}" ] ; then
+  export STREAM_JAR="${HADOOP_HOME}/contrib/hadoop-*-streaming.jar"
+fi
 # Location on default filesystem for writing gridmix data (usually HDFS)
 # Default: /gridmix/data
-export GRID_MIX_DATA=/gridmix/data
+# set var only if it has not already been set externally
+if [ -z "${GRID_MIX_DATA}" ] ; then
+  export GRID_MIX_DATA=/gridmix/data
+fi
 # Location of executables in default filesystem (usually HDFS)
 # Default: /gridmix/programs
-export GRID_MIX_PROG=/gridmix/programs
+# set var only if it has not already been set externally
+if [ -z "${GRID_MIX_PROG}" ] ; then
+  export GRID_MIX_PROG=/gridmix/programs
+fi
 
 ## Data sources
 # Variable length key, value compressed SequenceFile
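Each default above is applied only when the variable is not already set, so an externally exported value always wins. The same guard can also be written with shell parameter expansion; a compact equivalent shown only for comparison (not what the commit uses, and /opt/hadoop is a placeholder):

    # keep an externally supplied value, otherwise fall back to the default
    export HADOOP_HOME="${HADOOP_HOME:-/opt/hadoop}"
    export GRID_MIX_DATA="${GRID_MIX_DATA:-/gridmix/data}"
    export GRID_MIX_PROG="${GRID_MIX_PROG:-/gridmix/programs}"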

src/test/gridmix/submissionScripts/allToSameCluster  +11 -0

@@ -3,14 +3,25 @@
 GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/../gridmix-env
+PROCESSES=""
 
 $GRID_MIX_HOME/submissionScripts/maxentToSameCluster 2>&1 > maxentToSameCluster.out &
+PROCESSES="${PROCESSES} $!"
 sleep 20
 $GRID_MIX_HOME/submissionScripts/textSortToSameCluster 2>&1 > textSortToSameCluster.out  &
+PROCESSES="${PROCESSES} $!"
 sleep 20
 $GRID_MIX_HOME/submissionScripts/monsterQueriesToSameCluster 2>&1 > monsterQueriesToSameCluster.out &
+PROCESSES="${PROCESSES} $!"
 sleep 20
 $GRID_MIX_HOME/submissionScripts/webdataScanToSameCluster 2>&1 > webdataScanToSameCluster.out &
+PROCESSES="${PROCESSES} $!"
 sleep 20
 $GRID_MIX_HOME/submissionScripts/webdataSortToSameCluster  2>&1 > webdataSortToSameCluster.out &
+PROCESSES="${PROCESSES} $!"
+
+echo "Waiting for processes: ${PROCESSES}"
+for APROC in ${PROCESSES}; do
+  wait ${APROC}
+done
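Recording each background job's PID in PROCESSES and waiting on every one keeps the wrapper alive until all submission scripts finish, which is what lets an outer harness chain runs back to back. The pattern in isolation (jobA.sh and jobB.sh are stand-ins, not gridmix scripts); a bare `wait` with no arguments would likewise wait for every background child of the script:

    PIDS=""
    ./jobA.sh > jobA.out 2>&1 &    # stand-in for a submission script
    PIDS="${PIDS} $!"
    ./jobB.sh > jobB.out 2>&1 &
    PIDS="${PIDS} $!"
    for APROC in ${PIDS}; do
      wait ${APROC}
    done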
 

src/test/gridmix/submissionScripts/maxentToSameCluster  +7 -0

@@ -3,10 +3,17 @@
 GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/../gridmix-env
+PROCESSES=""
 
 for ((i=0; i < $NUM_OF_LARGE_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/maxent/maxent.large  2>&1 > maxent.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
+
+for APROC in ${PROCESSES}; do
+  wait ${APROC}
+done
+

src/test/gridmix/submissionScripts/monsterQueriesToSameCluster  +7 -0

@@ -3,11 +3,13 @@
 GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/../gridmix-env
+PROCESSES=""
 
 for ((i=0; i < $NUM_OF_SMALL_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/monsterQuery/monster_query.small  2>&1 > monster_query.medium.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
     
@@ -15,6 +17,7 @@ for ((i=0; i < $NUM_OF_MEDIUM_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/monsterQuery/monster_query.medium  2>&1 > monster_query.medium.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
 
@@ -22,6 +25,10 @@ for ((i=0; i < $NUM_OF_LARGE_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/monsterQuery/monster_query.large  2>&1 > monster_query.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
     
+for APROC in ${PROCESSES}; do
+  wait ${APROC}
+done

src/test/gridmix/submissionScripts/textSortToSameCluster  +14 -0

@@ -3,15 +3,19 @@
 GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/../gridmix-env
+PROCESSES=""
 
 for ((i=0; i < $NUM_OF_SMALL_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/pipesort/text-sort.small  2>&1 > pipesort.small.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
     $GRID_MIX_HOME/streamsort/text-sort.small  2>&1 > streamsort.small.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
     $GRID_MIX_HOME/javasort/text-sort.small  2>&1 > javasort.small.$i.out & 
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
     
@@ -19,10 +23,13 @@ for ((i=0; i < $NUM_OF_MEDIUM_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/pipesort/text-sort.medium  2>&1 > pipesort.medium.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
     $GRID_MIX_HOME/streamsort/text-sort.medium  2>&1 > streamsort.medium.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
     $GRID_MIX_HOME/javasort/text-sort.medium  2>&1 > javasort.medium.$i.out & 
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
 
@@ -30,10 +37,17 @@ for ((i=0; i < $NUM_OF_LARGE_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/pipesort/text-sort.large  2>&1 > pipesort.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
     $GRID_MIX_HOME/streamsort/text-sort.large  2>&1 > pipesort.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
     $GRID_MIX_HOME/javasort/text-sort.large  2>&1 > pipesort.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
+
+for APROC in ${PROCESSES}; do
+  wait ${APROC}
+done
     

src/test/gridmix/submissionScripts/webdataScanToSameCluster  +7 -1

@@ -3,11 +3,13 @@
 GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/../gridmix-env
+PROCESSES=""
     
 for ((i=0; i < $NUM_OF_MEDIUM_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/webdatascan/webdata_scan.medium  2>&1 > webdata_scan.medium.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
     
@@ -15,6 +17,7 @@ for ((i=0; i < $NUM_OF_SMALL_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/webdatascan/webdata_scan.small  2>&1 > webdata_scan.small.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
 
@@ -22,7 +25,10 @@ for ((i=0; i < $NUM_OF_LARGE_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/webdatascan/webdata_scan.large  2>&1 > webdata_scan.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
     
-
+for APROC in ${PROCESSES}; do
+  wait ${APROC}
+done

src/test/gridmix/submissionScripts/webdataSortToSameCluster  +5 -0

@@ -3,11 +3,16 @@
 GRID_DIR=`dirname "$0"`
 GRID_DIR=`cd "$GRID_DIR"; pwd`
 source $GRID_DIR/../gridmix-env
+PROCESSES=""
 
 for ((i=0; i < $NUM_OF_LARGE_JOBS_PER_CLASS; i++))
 do
     echo $i
     $GRID_MIX_HOME/webdatasort/webdata_sort.large  2>&1 > webdata_sort.large.$i.out &
+    PROCESSES="${PROCESSES} $!"
     $GRID_MIX_HOME/submissionScripts/sleep_if_too_busy
 done
     
+for APROC in ${PROCESSES}; do
+  wait ${APROC}
+done