|
@@ -30,6 +30,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock;
|
|
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
|
|
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
|
|
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
|
|
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
|
|
|
|
|
|
|
|
+import org.apache.commons.collections.CollectionUtils;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.hadoop.conf.Configuration;
|
|
import org.apache.hadoop.conf.Configuration;
|
|
@@ -77,6 +78,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeReconnectEvent;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeReconnectEvent;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStartedEvent;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStartedEvent;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent;
|
|
|
|
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.security.authorize.RMPolicyProvider;
|
|
import org.apache.hadoop.yarn.server.resourcemanager.security.authorize.RMPolicyProvider;
|
|
@@ -386,14 +388,37 @@ public class ResourceTrackerService extends AbstractService implements
|
|
} else {
|
|
} else {
|
|
LOG.info("Reconnect from the node at: " + host);
|
|
LOG.info("Reconnect from the node at: " + host);
|
|
this.nmLivelinessMonitor.unregister(nodeId);
|
|
this.nmLivelinessMonitor.unregister(nodeId);
|
|
- // Reset heartbeat ID since node just restarted.
|
|
|
|
- oldNode.resetLastNodeHeartBeatResponse();
|
|
|
|
- this.rmContext
|
|
|
|
- .getDispatcher()
|
|
|
|
- .getEventHandler()
|
|
|
|
- .handle(
|
|
|
|
- new RMNodeReconnectEvent(nodeId, rmNode, request
|
|
|
|
- .getRunningApplications(), request.getNMContainerStatuses()));
|
|
|
|
|
|
+
|
|
|
|
+ if (CollectionUtils.isEmpty(request.getRunningApplications())
|
|
|
|
+ && rmNode.getState() != NodeState.DECOMMISSIONING
|
|
|
|
+ && rmNode.getHttpPort() != oldNode.getHttpPort()) {
|
|
|
|
+ // Reconnected node differs, so replace old node and start new node
|
|
|
|
+ switch (rmNode.getState()) {
|
|
|
|
+ case RUNNING:
|
|
|
|
+ ClusterMetrics.getMetrics().decrNumActiveNodes();
|
|
|
|
+ break;
|
|
|
|
+ case UNHEALTHY:
|
|
|
|
+ ClusterMetrics.getMetrics().decrNumUnhealthyNMs();
|
|
|
|
+ break;
|
|
|
|
+ default:
|
|
|
|
+ LOG.debug("Unexpected Rmnode state");
|
|
|
|
+ }
|
|
|
|
+ this.rmContext.getDispatcher().getEventHandler()
|
|
|
|
+ .handle(new NodeRemovedSchedulerEvent(rmNode));
|
|
|
|
+
|
|
|
|
+ this.rmContext.getRMNodes().put(nodeId, rmNode);
|
|
|
|
+ this.rmContext.getDispatcher().getEventHandler()
|
|
|
|
+ .handle(new RMNodeStartedEvent(nodeId, null, null));
|
|
|
|
+
|
|
|
|
+ } else {
|
|
|
|
+ // Reset heartbeat ID since node just restarted.
|
|
|
|
+ oldNode.resetLastNodeHeartBeatResponse();
|
|
|
|
+
|
|
|
|
+ this.rmContext.getDispatcher().getEventHandler()
|
|
|
|
+ .handle(new RMNodeReconnectEvent(nodeId, rmNode,
|
|
|
|
+ request.getRunningApplications(),
|
|
|
|
+ request.getNMContainerStatuses()));
|
|
|
|
+ }
|
|
}
|
|
}
|
|
// On every node manager register we will be clearing NMToken keys if
|
|
// On every node manager register we will be clearing NMToken keys if
|
|
// present for any running application.
|
|
// present for any running application.
|
|
@@ -490,12 +515,13 @@ public class ResourceTrackerService extends AbstractService implements
|
|
|
|
|
|
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
|
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
|
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
|
|
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
|
|
- if (remoteNodeStatus.getResponseId() + 1 == lastNodeHeartbeatResponse
|
|
|
|
- .getResponseId()) {
|
|
|
|
|
|
+ if (getNextResponseId(
|
|
|
|
+ remoteNodeStatus.getResponseId()) == lastNodeHeartbeatResponse
|
|
|
|
+ .getResponseId()) {
|
|
LOG.info("Received duplicate heartbeat from node "
|
|
LOG.info("Received duplicate heartbeat from node "
|
|
+ rmNode.getNodeAddress()+ " responseId=" + remoteNodeStatus.getResponseId());
|
|
+ rmNode.getNodeAddress()+ " responseId=" + remoteNodeStatus.getResponseId());
|
|
return lastNodeHeartbeatResponse;
|
|
return lastNodeHeartbeatResponse;
|
|
- } else if (remoteNodeStatus.getResponseId() + 1 < lastNodeHeartbeatResponse
|
|
|
|
|
|
+ } else if (remoteNodeStatus.getResponseId() != lastNodeHeartbeatResponse
|
|
.getResponseId()) {
|
|
.getResponseId()) {
|
|
String message =
|
|
String message =
|
|
"Too far behind rm response id:"
|
|
"Too far behind rm response id:"
|
|
@@ -510,13 +536,11 @@ public class ResourceTrackerService extends AbstractService implements
|
|
}
|
|
}
|
|
|
|
|
|
// Heartbeat response
|
|
// Heartbeat response
|
|
- NodeHeartbeatResponse nodeHeartBeatResponse = YarnServerBuilderUtils
|
|
|
|
- .newNodeHeartbeatResponse(lastNodeHeartbeatResponse.
|
|
|
|
- getResponseId() + 1, NodeAction.NORMAL, null, null, null, null,
|
|
|
|
- nextHeartBeatInterval);
|
|
|
|
- rmNode.updateNodeHeartbeatResponseForCleanup(nodeHeartBeatResponse);
|
|
|
|
- rmNode.updateNodeHeartbeatResponseForContainersDecreasing(
|
|
|
|
- nodeHeartBeatResponse);
|
|
|
|
|
|
+ NodeHeartbeatResponse nodeHeartBeatResponse =
|
|
|
|
+ YarnServerBuilderUtils.newNodeHeartbeatResponse(
|
|
|
|
+ getNextResponseId(lastNodeHeartbeatResponse.getResponseId()),
|
|
|
|
+ NodeAction.NORMAL, null, null, null, null, nextHeartBeatInterval);
|
|
|
|
+ rmNode.setAndUpdateNodeHeartbeatResponse(nodeHeartBeatResponse);
|
|
|
|
|
|
populateKeys(request, nodeHeartBeatResponse);
|
|
populateKeys(request, nodeHeartBeatResponse);
|
|
|
|
|
|
@@ -528,7 +552,7 @@ public class ResourceTrackerService extends AbstractService implements
|
|
|
|
|
|
// 4. Send status to RMNode, saving the latest response.
|
|
// 4. Send status to RMNode, saving the latest response.
|
|
RMNodeStatusEvent nodeStatusEvent =
|
|
RMNodeStatusEvent nodeStatusEvent =
|
|
- new RMNodeStatusEvent(nodeId, remoteNodeStatus, nodeHeartBeatResponse);
|
|
|
|
|
|
+ new RMNodeStatusEvent(nodeId, remoteNodeStatus);
|
|
if (request.getLogAggregationReportsForApps() != null
|
|
if (request.getLogAggregationReportsForApps() != null
|
|
&& !request.getLogAggregationReportsForApps().isEmpty()) {
|
|
&& !request.getLogAggregationReportsForApps().isEmpty()) {
|
|
nodeStatusEvent.setLogAggregationReportsForApps(request
|
|
nodeStatusEvent.setLogAggregationReportsForApps(request
|
|
@@ -562,6 +586,11 @@ public class ResourceTrackerService extends AbstractService implements
|
|
return nodeHeartBeatResponse;
|
|
return nodeHeartBeatResponse;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ private int getNextResponseId(int responseId) {
|
|
|
|
+ // Loop between 0 and Integer.MAX_VALUE
|
|
|
|
+ return (responseId + 1) & Integer.MAX_VALUE;
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* Check if node in decommissioning state.
|
|
* Check if node in decommissioning state.
|
|
* @param nodeId
|
|
* @param nodeId
|