|
@@ -280,6 +280,7 @@ public class NodesListManager extends CompositeService implements
|
|
|
StringUtils.join(",", hostsReader.getExcludedHosts()) + "}");
|
|
|
|
|
|
handleExcludeNodeList(graceful, timeout);
|
|
|
+ markUnregisteredNodesAsLost(yarnConf);
|
|
|
}
|
|
|
|
|
|
private void setDecommissionedNMs() {
|
|
@@ -387,6 +388,115 @@ public class NodesListManager extends CompositeService implements
|
|
|
updateInactiveNodes();
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Marks the unregistered nodes as LOST
|
|
|
+ * if the feature is enabled via a configuration flag.
|
|
|
+ *
|
|
|
+ * This method finds nodes that are present in the include list but are not
|
|
|
+ * registered with the ResourceManager. Such nodes are then marked as LOST.
|
|
|
+ *
|
|
|
+ * The steps are as follows:
|
|
|
+ * 1. Retrieve all hostnames of registered nodes from RM.
|
|
|
+ * 2. Identify the nodes present in the include list but are not registered
|
|
|
+ * 3. Remove nodes from the exclude list
|
|
|
+ * 4. Dispatch LOST events for filtered nodes to mark them as LOST.
|
|
|
+ *
|
|
|
+ * @param yarnConf Configuration object that holds the YARN configurations.
|
|
|
+ */
|
|
|
+ private void markUnregisteredNodesAsLost(Configuration yarnConf) {
|
|
|
+ // Check if tracking unregistered nodes is enabled in the configuration
|
|
|
+ if (!yarnConf.getBoolean(YarnConfiguration.ENABLE_TRACKING_FOR_UNREGISTERED_NODES,
|
|
|
+ YarnConfiguration.DEFAULT_ENABLE_TRACKING_FOR_UNREGISTERED_NODES)) {
|
|
|
+ LOG.debug("Unregistered node tracking is disabled. " +
|
|
|
+ "Skipping marking unregistered nodes as LOST.");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Set to store all registered hostnames from both active and inactive lists
|
|
|
+ Set<String> registeredHostNames = gatherRegisteredHostNames();
|
|
|
+ // Event handler to dispatch LOST events
|
|
|
+ EventHandler eventHandler = this.rmContext.getDispatcher().getEventHandler();
|
|
|
+
|
|
|
+ // Identify nodes that are in the include list but are not registered
|
|
|
+ // and are not in the exclude list
|
|
|
+ List<String> nodesToMarkLost = new ArrayList<>();
|
|
|
+ HostDetails hostDetails = hostsReader.getHostDetails();
|
|
|
+ Set<String> includes = hostDetails.getIncludedHosts();
|
|
|
+ Set<String> excludes = hostDetails.getExcludedHosts();
|
|
|
+
|
|
|
+ for (String includedNode : includes) {
|
|
|
+ if (!registeredHostNames.contains(includedNode) && !excludes.contains(includedNode)) {
|
|
|
+ LOG.info("Lost node: {}", includedNode);
|
|
|
+ nodesToMarkLost.add(includedNode);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Dispatch LOST events for the identified lost nodes
|
|
|
+ for (String lostNode : nodesToMarkLost) {
|
|
|
+ dispatchLostEvent(eventHandler, lostNode);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Log successful completion of marking unregistered nodes as LOST
|
|
|
+ LOG.info("Successfully marked unregistered nodes as LOST");
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Gathers all registered hostnames from both active and inactive RMNodes.
|
|
|
+ *
|
|
|
+ * @return A set of registered hostnames.
|
|
|
+ */
|
|
|
+ private Set<String> gatherRegisteredHostNames() {
|
|
|
+ Set<String> registeredHostNames = new HashSet<>();
|
|
|
+ LOG.info("Getting all the registered hostnames");
|
|
|
+
|
|
|
+ // Gather all registered nodes (active) from RM into the set
|
|
|
+ for (RMNode node : this.rmContext.getRMNodes().values()) {
|
|
|
+ registeredHostNames.add(node.getHostName());
|
|
|
+ }
|
|
|
+
|
|
|
+ // Gather all inactive nodes from RM into the set
|
|
|
+ for (RMNode node : this.rmContext.getInactiveRMNodes().values()) {
|
|
|
+ registeredHostNames.add(node.getHostName());
|
|
|
+ }
|
|
|
+
|
|
|
+ return registeredHostNames;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Dispatches a LOST event for a specified lost node.
|
|
|
+ *
|
|
|
+ * @param eventHandler The EventHandler used to dispatch the LOST event.
|
|
|
+ * @param lostNode The hostname of the lost node for which the event is
|
|
|
+ * being dispatched.
|
|
|
+ */
|
|
|
+ private void dispatchLostEvent(EventHandler eventHandler, String lostNode) {
|
|
|
+ // Generate a NodeId for the lost node with a special port -2
|
|
|
+ NodeId nodeId = createLostNodeId(lostNode);
|
|
|
+ RMNodeEvent lostEvent = new RMNodeEvent(nodeId, RMNodeEventType.EXPIRE);
|
|
|
+ RMNodeImpl rmNode = new RMNodeImpl(nodeId, this.rmContext, lostNode, -2, -2,
|
|
|
+ new UnknownNode(lostNode), Resource.newInstance(0, 0), "unknown");
|
|
|
+
|
|
|
+ try {
|
|
|
+ // Dispatch the LOST event to signal the node is no longer active
|
|
|
+ eventHandler.handle(lostEvent);
|
|
|
+
|
|
|
+ // After successful dispatch, update the node status in RMContext
|
|
|
+ // Set the node's timestamp for when it became untracked
|
|
|
+ rmNode.setUntrackedTimeStamp(Time.monotonicNow());
|
|
|
+
|
|
|
+ // Add the node to the active and inactive node maps in RMContext
|
|
|
+ this.rmContext.getRMNodes().put(nodeId, rmNode);
|
|
|
+ this.rmContext.getInactiveRMNodes().put(nodeId, rmNode);
|
|
|
+
|
|
|
+ LOG.info("Successfully dispatched LOST event and deactivated node: {}, Node ID: {}",
|
|
|
+ lostNode, nodeId);
|
|
|
+ } catch (Exception e) {
|
|
|
+ // Log any exception encountered during event dispatch
|
|
|
+ LOG.error("Error dispatching LOST event for node: {}, Node ID: {} - {}",
|
|
|
+ lostNode, nodeId, e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
@VisibleForTesting
|
|
|
public int getNodeRemovalCheckInterval() {
|
|
|
return nodeRemovalCheckInterval;
|
|
@@ -711,6 +821,20 @@ public class NodesListManager extends CompositeService implements
|
|
|
return NodeId.newInstance(host, -1);
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Creates a NodeId for a node marked as LOST.
|
|
|
+ *
|
|
|
+ * The NodeId combines the hostname with a special port value of -2, indicating
|
|
|
+ * that the node is lost in the cluster.
|
|
|
+ *
|
|
|
+ * @param host The hostname of the lost node.
|
|
|
+ * @return NodeId Unique identifier for the lost node, with the port set to -2.
|
|
|
+ */
|
|
|
+ public static NodeId createLostNodeId(String host) {
|
|
|
+ // Create a NodeId with the given host and port -2 to signify the node is lost.
|
|
|
+ return NodeId.newInstance(host, -2);
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* A Node instance needed upon startup for populating inactive nodes Map.
|
|
|
* It only knows its hostname/ip.
|