
HADOOP-11406. xargs -P is not portable (Kengo Seki via aw)

Allen Wittenauer 10 years ago
parent
commit
5504a261f8
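
For context, not stated in the commit itself: xargs -P runs up to the given number of invocations in parallel, but it is a GNU/BSD extension rather than a POSIX requirement, so some platforms ship an xargs without it. A minimal illustration, assuming a system whose xargs does support -P:

  # run up to 4 echo processes in parallel over the piped host list;
  # -P is the non-portable option this commit removes from the default path
  printf '%s\n' host1 host2 host3 host4 | xargs -n 1 -P 4 echo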

hadoop-common-project/hadoop-common/CHANGES.txt  (+2, -0)

@@ -466,6 +466,8 @@ Trunk (Unreleased)
     HADOOP-9891. CLIMiniCluster instructions fail with MiniYarnCluster
     ClassNotFoundException (Darrell Taylor via aw)
 
+    HADOOP-11406. xargs -P is not portable (Kengo Seki via aw)
+
   OPTIMIZATIONS
 
     HADOOP-7761. Improve the performance of raw comparisons. (todd)

hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh  (+24, -18)

@@ -461,27 +461,33 @@ function hadoop_connect_to_hosts
     if [[ -z "${SLAVE_NAMES}" ]]; then
       SLAVE_NAMES=$(sed 's/#.*$//;/^$/d' "${SLAVE_FILE}")
     fi
-
-    # quoting here gets tricky. it's easier to push it into a function
-    # so that we don't have to deal with it. However...
-    # xargs can't use a function so instead we'll export it out
-    # and force it into a subshell
-    # moral of the story: just use pdsh.
-    export -f hadoop_actual_ssh
-    export HADOOP_SSH_OPTS
-
-    # xargs is used with option -I to replace the placeholder in arguments
-    # list with each hostname read from stdin/pipe. But it consider one
-    # line as one argument while reading from stdin/pipe. So place each
-    # hostname in different lines while passing via pipe.
-    SLAVE_NAMES=$(echo "$SLAVE_NAMES" | tr ' ' '\n' )
-    echo "${SLAVE_NAMES}" | \
-    xargs -n 1 -P"${HADOOP_SSH_PARALLEL}" \
-    -I {} bash -c --  "hadoop_actual_ssh {} ${params}"
-    wait
+    hadoop_connect_to_hosts_without_pdsh "${params}"
   fi
 }
 
+## @description  Connect to ${SLAVE_NAMES} and execute a command
+## @description  in environments that do not support pdsh.
+## @audience     private
+## @stability    evolving
+## @replaceable  yes
+## @param        command
+## @param        [...]
+function hadoop_connect_to_hosts_without_pdsh
+{
+  # shellcheck disable=SC2124
+  local params="$@"
+  local slaves=(${SLAVE_NAMES})
+  for (( i = 0; i < ${#slaves[@]}; i++ ))
+  do
+    if (( i != 0 && i % HADOOP_SSH_PARALLEL == 0 )); then
+      wait
+    fi
+    # shellcheck disable=SC2086
+    hadoop_actual_ssh "${slaves[$i]}" ${params} &
+  done
+  wait
+}
+
 ## @description  Utility routine to handle --slaves mode
 ## @audience     private
 ## @stability    evolving
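
The replacement above batches the SSH connections with background jobs and wait instead of xargs -P. A minimal standalone sketch of that pattern, using hypothetical host names and a run_on_host stand-in for hadoop_actual_ssh:

  #!/usr/bin/env bash
  # wait-batched parallel loop, mirroring hadoop_connect_to_hosts_without_pdsh;
  # hosts and run_on_host are hypothetical stand-ins for SLAVE_NAMES and
  # hadoop_actual_ssh
  HADOOP_SSH_PARALLEL=${HADOOP_SSH_PARALLEL:-2}
  hosts=(node1 node2 node3 node4 node5)

  run_on_host()
  {
    # placeholder for: ssh ${HADOOP_SSH_OPTS} "$1" "<command>"
    echo "running on $1"
  }

  for (( i = 0; i < ${#hosts[@]}; i++ ))
  do
    # once HADOOP_SSH_PARALLEL jobs are in flight, wait for the batch to finish
    if (( i != 0 && i % HADOOP_SSH_PARALLEL == 0 )); then
      wait
    fi
    run_on_host "${hosts[$i]}" &
  done
  wait

Unlike xargs -P, this waits for a whole batch before starting the next one, so a single slow host can stall its batch; that is the trade-off for portability.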

hadoop-common-project/hadoop-common/src/main/conf/hadoop-user-functions.sh.example  (+28, -1)

@@ -50,7 +50,7 @@
 #
 
 #
-# Another example:  finding java
+# Example:  finding java
 #
 # By default, Hadoop assumes that $JAVA_HOME is always defined
 # outside of its configuration. Eons ago, Apple standardized
@@ -85,3 +85,30 @@
 #    exit 1
 #  fi
 #}
+
+#
+# Example:  efficient command execution for the slaves
+#
+# To improve performance, you can use xargs -P
+# instead of the for loop, if supported.
+#
+#function hadoop_connect_to_hosts_without_pdsh
+#{
+#  # quoting here gets tricky. it's easier to push it into a function
+#  # so that we don't have to deal with it. However...
+#  # xargs can't use a function so instead we'll export it out
+#  # and force it into a subshell
+#  # moral of the story: just use pdsh.
+#  export -f hadoop_actual_ssh
+#  export HADOOP_SSH_OPTS
+#
+#  # xargs is used with option -I to replace the placeholder in arguments
+#  # list with each hostname read from stdin/pipe. But it considers each
+#  # line as one argument while reading from stdin/pipe, so place each
+#  # hostname on its own line when passing via the pipe.
+#  SLAVE_NAMES=$(echo "$SLAVE_NAMES" | tr ' ' '\n' )
+#  echo "${SLAVE_NAMES}" | \
+#  xargs -n 1 -P"${HADOOP_SSH_PARALLEL}" \
+#  -I {} bash -c --  "hadoop_actual_ssh {} ${params}"
+#  wait
+#}
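
Usage note, an assumption based on the @replaceable marking rather than anything stated in the commit: copying the commented-out function above into hadoop-user-functions.sh and uncommenting it should restore the xargs -P behaviour on platforms whose xargs supports that option. A hypothetical probe for that support:

  # exit status of xargs tells us whether the local implementation accepts -P
  if printf 'x\n' | xargs -P 1 -n 1 echo >/dev/null 2>&1; then
    echo "local xargs supports -P; the override above can be enabled"
  else
    echo "local xargs lacks -P; keep the portable for-loop default"
  fi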