Bladeren bron

AMBARI-2099. Cluster install failed due to timeout and the user can proceed to cluster management; the user was not presented an option to retry install. (yusaku)

git-svn-id: https://svn.apache.org/repos/asf/incubator/ambari/trunk@1481142 13f79535-47bb-0310-9956-ffa450edef68
Yusaku Sako 12 jaren geleden
bovenliggende
commit
9d560aad82

+ 4 - 0
CHANGES.txt

@@ -837,6 +837,10 @@ Trunk (unreleased changes):
 
  BUG FIXES
 
+ AMBARI-2099. Cluster install failed due to timeout and the user can proceed
+ to cluster management; the user was not presented an option to retry install.
+ (yusaku)
+
  AMBARI-2101. Hive service check (still) failing with file permissions.
  (swagle)
 

File diff suppressed because it is too large
+ 11 - 0
ambari-web/app/assets/data/wizard/deploy/4_hosts_install_fail_without_failed_status/poll_1.json


+ 18 - 2
ambari-web/app/controllers/wizard/step9_controller.js

@@ -416,6 +416,8 @@ App.WizardStep9Controller = Em.Controller.extend({
   },
 
   // marks a host's status as "warning" if at least one of the tasks is FAILED, ABORTED, or TIMEDOUT and marks host's status as "failed" if at least one master component install task is FAILED.
+  // note that if the master failed to install because of ABORTED or TIMEDOUT, we don't mark it as failed, because this would mark all hosts as "failed" and make it difficult for the user
+  // to find which host the FAILED status occurred on, if any
   onErrorPerHost: function (actions, contentHost) {
     if (actions.someProperty('Tasks.status', 'FAILED') || actions.someProperty('Tasks.status', 'ABORTED') || actions.someProperty('Tasks.status', 'TIMEDOUT')) {
       contentHost.set('status', 'warning');
@@ -492,19 +494,33 @@ App.WizardStep9Controller = Em.Controller.extend({
     return polledData.everyProperty('Tasks.status', 'COMPLETED');
   },
 
-  //return true if at least 50% of the slave host components for the particular service component fails to install
+  /**
+   * return true if:
+   *  1. any of the master/client components failed to install
+   *  OR
+   *  2. more than 50% of the slave host components for the particular service component fail to install
+   */
   isStepFailed: function () {
     var failed = false;
     var polledData = this.get('polledData');
     polledData.filterProperty('Tasks.command', 'INSTALL').mapProperty('Tasks.role').uniq().forEach(function (role) {
+      if (failed) {
+        return;
+      }
+      var actionsPerRole = polledData.filterProperty('Tasks.role', role);
       if (['DATANODE', 'TASKTRACKER', 'HBASE_REGIONSERVER', 'GANGLIA_MONITOR'].contains(role)) {
-        var actionsPerRole = polledData.filterProperty('Tasks.role', role);
+        // check slave components for success factor.
+        // partial failures of slave components are allowed.
         var actionsFailed = actionsPerRole.filterProperty('Tasks.status', 'FAILED');
         var actionsAborted = actionsPerRole.filterProperty('Tasks.status', 'ABORTED');
         var actionsTimedOut = actionsPerRole.filterProperty('Tasks.status', 'TIMEDOUT');
         if ((((actionsFailed.length + actionsAborted.length + actionsTimedOut.length) / actionsPerRole.length) * 100) > 50) {
           failed = true;
         }
+      } else if (actionsPerRole.someProperty('Tasks.status', 'FAILED') || actionsPerRole.someProperty('Tasks.status', 'ABORTED') ||
+        actionsPerRole.someProperty('Tasks.status', 'TIMEDOUT')) {
+        // check non-slave components (i.e., masters and clients).  all of these must be successfully installed.
+        failed = true;
       }
     }, this);
     return failed;

Some files were not shown because too many files changed in this diff