|
@@ -30,6 +30,8 @@ import org.apache.ambari.server.controller.PrereqCheckRequest;
|
|
|
import org.apache.ambari.server.orm.models.HostComponentSummary;
|
|
|
import org.apache.ambari.server.state.Cluster;
|
|
|
import org.apache.ambari.server.state.ComponentInfo;
|
|
|
+import org.apache.ambari.server.state.Host;
|
|
|
+import org.apache.ambari.server.state.MaintenanceState;
|
|
|
import org.apache.ambari.server.state.Service;
|
|
|
import org.apache.ambari.server.state.ServiceComponent;
|
|
|
import org.apache.ambari.server.state.StackId;
|
|
@@ -41,7 +43,24 @@ import org.apache.commons.lang.StringUtils;
|
|
|
import com.google.inject.Singleton;
|
|
|
|
|
|
/**
|
|
|
- * Checks that services are up.
|
|
|
+ * The {@link ServicesUpCheck} class is used to ensure that services are up
|
|
|
+ * "enough" before an upgrade begins. This class uses the following rules:
|
|
|
+ * <ul>
|
|
|
+ * <li>If the component is a CLIENT, it will skip it</li>
|
|
|
+ * <li>If the component is a MASTER then it must be online and not in MM</li>
|
|
|
+ * <li>If the component is a SLAVE
|
|
|
+ * <ul>
|
|
|
+ * <li>If the cardinality is 1+, then determine if {@value #SLAVE_THRESHOLD} %
|
|
|
+ * are running. Hosts in MM are counted as being "up" since they are not part of
|
|
|
+ * the upgrade.</li>
|
|
|
+ * <li>If the cardinality is 0, then all instances must be online. Hosts in MM
|
|
|
+ * are counted as being "up" since they are not part of the upgrade.</li>
|
|
|
+ * </ul>
|
|
|
+ * </ul>
|
|
|
+ * It seems counter-intuitive to have a liveliness check which allows a
|
|
|
+ * percentage of the slaves to be down. The goal is to be able to start an
|
|
|
+ * upgrade, even if some slave components on healthy hosts are down. We still
|
|
|
+ * want hosts to be scehdule for upgrade of their other components.
|
|
|
*/
|
|
|
@Singleton
|
|
|
@UpgradeCheck(group = UpgradeCheckGroup.LIVELINESS, order = 2.0f, required = true)
|
|
@@ -56,8 +75,13 @@ public class ServicesUpCheck extends AbstractCheckDescriptor {
|
|
|
super(CheckDescription.SERVICES_UP);
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * {@inheritDoc}
|
|
|
+ */
|
|
|
@Override
|
|
|
- public void perform(PrerequisiteCheck prerequisiteCheck, PrereqCheckRequest request) throws AmbariException {
|
|
|
+ public void perform(PrerequisiteCheck prerequisiteCheck, PrereqCheckRequest request)
|
|
|
+ throws AmbariException {
|
|
|
+
|
|
|
final String clusterName = request.getClusterName();
|
|
|
final Cluster cluster = clustersProvider.get().getCluster(clusterName);
|
|
|
List<String> errorMessages = new ArrayList<String>();
|
|
@@ -69,74 +93,84 @@ public class ServicesUpCheck extends AbstractCheckDescriptor {
|
|
|
final Service service = serviceEntry.getValue();
|
|
|
|
|
|
// Ignore services like Tez that are clientOnly.
|
|
|
- if (!service.isClientOnlyService()) {
|
|
|
- Map<String, ServiceComponent> serviceComponents = service.getServiceComponents();
|
|
|
+ if (service.isClientOnlyService()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
|
|
|
- for (Map.Entry<String, ServiceComponent> component : serviceComponents.entrySet()) {
|
|
|
+ Map<String, ServiceComponent> serviceComponents = service.getServiceComponents();
|
|
|
+ for (Map.Entry<String, ServiceComponent> component : serviceComponents.entrySet()) {
|
|
|
|
|
|
- boolean ignoreComponent = false;
|
|
|
- boolean checkThreshold = false;
|
|
|
+ ServiceComponent serviceComponent = component.getValue();
|
|
|
+ // In Services like HDFS, ignore components like HDFS Client
|
|
|
+ if (serviceComponent.isClientComponent()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
|
|
|
- ServiceComponent serviceComponent = component.getValue();
|
|
|
- // In Services like HDFS, ignore components like HDFS Client
|
|
|
- if (serviceComponent.isClientComponent()) {
|
|
|
- ignoreComponent = true;
|
|
|
+ // skip if the component is not part of the finalization version check
|
|
|
+ if (!serviceComponent.isVersionAdvertised()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // TODO, add more logic that checks the Upgrade Pack.
|
|
|
+ // These components are not in the upgrade pack and do not advertise a
|
|
|
+ // version:
|
|
|
+ // ZKFC, Ambari Metrics, Kerberos, Atlas (right now).
|
|
|
+ // Generally, if it advertises a version => in the upgrade pack.
|
|
|
+ // So it can be in the Upgrade Pack but not advertise a version.
|
|
|
+ List<HostComponentSummary> hostComponentSummaries = HostComponentSummary.getHostComponentSummaries(
|
|
|
+ service.getName(), serviceComponent.getName());
|
|
|
+
|
|
|
+ // not installed, nothing to do
|
|
|
+ if (hostComponentSummaries.isEmpty()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // non-master, "true" slaves with cardinality 1+
|
|
|
+ boolean checkThreshold = false;
|
|
|
+ if (!serviceComponent.isMasterComponent()) {
|
|
|
+ ComponentInfo componentInfo = ambariMetaInfo.get().getComponent(stackId.getStackName(),
|
|
|
+ stackId.getStackVersion(), serviceComponent.getServiceName(),
|
|
|
+ serviceComponent.getName());
|
|
|
+
|
|
|
+ String cardinality = componentInfo.getCardinality();
|
|
|
+ // !!! check if there can be more than one. This will match, say,
|
|
|
+ // datanodes but not ZKFC
|
|
|
+ if (null != cardinality
|
|
|
+ && (cardinality.equals("ALL") || cardinality.matches("[1-9].*"))) {
|
|
|
+ checkThreshold = true;
|
|
|
}
|
|
|
+ }
|
|
|
+
|
|
|
+ // check threshold for slaves which have a non-0 cardinality
|
|
|
+ if (checkThreshold) {
|
|
|
+ int total = hostComponentSummaries.size();
|
|
|
+ int up = 0;
|
|
|
+ int down = 0;
|
|
|
|
|
|
- // non-master, "true" slaves with cardinality 1+
|
|
|
- if (!ignoreComponent && !serviceComponent.isMasterComponent()) {
|
|
|
- ComponentInfo componentInfo = ambariMetaInfo.get().getComponent(
|
|
|
- stackId.getStackName(), stackId.getStackVersion(),
|
|
|
- serviceComponent.getServiceName(), serviceComponent.getName());
|
|
|
-
|
|
|
- String cardinality = componentInfo.getCardinality();
|
|
|
- // !!! check if there can be more than one. This will match, say, datanodes but not ZKFC
|
|
|
- if (null != cardinality &&
|
|
|
- (cardinality.equals("ALL") || cardinality.matches("[1-9].*"))) {
|
|
|
- checkThreshold = true;
|
|
|
+ for (HostComponentSummary summary : hostComponentSummaries) {
|
|
|
+ if (isConsideredDown(cluster, serviceComponent, summary)) {
|
|
|
+ down++;
|
|
|
+ } else {
|
|
|
+ up++;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (!serviceComponent.isVersionAdvertised()) {
|
|
|
- ignoreComponent = true;
|
|
|
+ if ((float) down / total > SLAVE_THRESHOLD) { // arbitrary
|
|
|
+ failedServiceNames.add(service.getName());
|
|
|
+ String message = MessageFormat.format(
|
|
|
+ "{0}: {1} out of {2} {3} are started; there should be {4,number,percent} started before upgrading.",
|
|
|
+ service.getName(), up, total, serviceComponent.getName(), SLAVE_THRESHOLD);
|
|
|
+ errorMessages.add(message);
|
|
|
}
|
|
|
-
|
|
|
- // TODO, add more logic that checks the Upgrade Pack.
|
|
|
- // These components are not in the upgrade pack and do not advertise a version:
|
|
|
- // ZKFC, Ambari Metrics, Kerberos, Atlas (right now).
|
|
|
- // Generally, if it advertises a version => in the upgrade pack.
|
|
|
- // So it can be in the Upgrade Pack but not advertise a version.
|
|
|
- if (!ignoreComponent) {
|
|
|
- List<HostComponentSummary> hostComponentSummaries = HostComponentSummary.getHostComponentSummaries(service.getName(), serviceComponent.getName());
|
|
|
-
|
|
|
- if (checkThreshold && !hostComponentSummaries.isEmpty()) {
|
|
|
- int total = hostComponentSummaries.size();
|
|
|
- int up = 0;
|
|
|
- int down = 0;
|
|
|
-
|
|
|
- for(HostComponentSummary s : hostComponentSummaries) {
|
|
|
- if ((s.getDesiredState() == State.INSTALLED || s.getDesiredState() == State.STARTED) && State.STARTED != s.getCurrentState()) {
|
|
|
- down++;
|
|
|
- } else {
|
|
|
- up++;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if ((float) down/total > SLAVE_THRESHOLD) { // arbitrary
|
|
|
- failedServiceNames.add(service.getName());
|
|
|
- String message = MessageFormat.format("{0}: {1} out of {2} {3} are started; there should be {4,number,percent} started before upgrading.",
|
|
|
- service.getName(), up, total, serviceComponent.getName(), SLAVE_THRESHOLD);
|
|
|
- errorMessages.add(message);
|
|
|
- }
|
|
|
- } else {
|
|
|
- for(HostComponentSummary s : hostComponentSummaries) {
|
|
|
- if ((s.getDesiredState() == State.INSTALLED || s.getDesiredState() == State.STARTED) && State.STARTED != s.getCurrentState()) {
|
|
|
- failedServiceNames.add(service.getName());
|
|
|
- String message = MessageFormat.format("{0}: {1} (in {2} on host {3})", service.getName(), serviceComponent.getName(), s.getCurrentState(), s.getHostName());
|
|
|
- errorMessages.add(message);
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
+ } else {
|
|
|
+ for (HostComponentSummary summary : hostComponentSummaries) {
|
|
|
+ if (isConsideredDown(cluster, serviceComponent, summary)) {
|
|
|
+ failedServiceNames.add(service.getName());
|
|
|
+ String message = MessageFormat.format("{0}: {1} (in {2} on host {3})",
|
|
|
+ service.getName(), serviceComponent.getName(), summary.getCurrentState(),
|
|
|
+ summary.getHostName());
|
|
|
+ errorMessages.add(message);
|
|
|
+ break;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -146,8 +180,48 @@ public class ServicesUpCheck extends AbstractCheckDescriptor {
|
|
|
if (!errorMessages.isEmpty()) {
|
|
|
prerequisiteCheck.setFailedOn(new LinkedHashSet<String>(failedServiceNames));
|
|
|
prerequisiteCheck.setStatus(PrereqCheckStatus.FAIL);
|
|
|
- prerequisiteCheck.setFailReason("The following Service Components should be in a started state. Please invoke a service Stop and full Start and try again. " + StringUtils.join(errorMessages, ", "));
|
|
|
+ prerequisiteCheck.setFailReason(
|
|
|
+ "The following Service Components should be in a started state. Please invoke a service Stop and full Start and try again. "
|
|
|
+ + StringUtils.join(errorMessages, ", "));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Gets whether this component should be considered as being "down" for the
|
|
|
+ * purposes of this check. Component type, maintenance mode, and state are
|
|
|
+ * taken into account.
|
|
|
+ *
|
|
|
+ * @param clusters
|
|
|
+ * the clusters instance
|
|
|
+ * @param cluster
|
|
|
+ * the cluster
|
|
|
+ * @param serviceComponent
|
|
|
+ * the component
|
|
|
+ * @param summary
|
|
|
+ * a summary of the state of the component on a host
|
|
|
+ * @return {@code true} if the host component should be considered as failing
|
|
|
+ * this test.
|
|
|
+ * @throws AmbariException
|
|
|
+ */
|
|
|
+ private boolean isConsideredDown(Cluster cluster, ServiceComponent serviceComponent,
|
|
|
+ HostComponentSummary summary) throws AmbariException {
|
|
|
+ Host host = clustersProvider.get().getHostById(summary.getHostId());
|
|
|
+ MaintenanceState maintenanceState = host.getMaintenanceState(cluster.getClusterId());
|
|
|
+
|
|
|
+ // non-MASTER components in maintenance mode should not count
|
|
|
+ if (maintenanceState == MaintenanceState.ON && !serviceComponent.isMasterComponent()) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ State desiredState = summary.getDesiredState();
|
|
|
+ State currentState = summary.getCurrentState();
|
|
|
+
|
|
|
+ switch (desiredState) {
|
|
|
+ case INSTALLED:
|
|
|
+ case STARTED:
|
|
|
+ return currentState != State.STARTED;
|
|
|
+ default:
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|