Browse Source

YARN-11198. clean up numa resources from statestore (#4546)

* YARN-11198. clean up numa resources from levelDB

Co-authored-by: Deb <dbsamrat@3c22fba1b03f.ant.amazon.com>
Samrat 2 years ago
parent
commit
84ce592a85

+ 10 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/numa/NumaResourceAllocator.java

@@ -231,7 +231,7 @@ public class NumaResourceAllocator {
   }
 
   private NumaResourceAllocation allocate(ContainerId containerId,
-      Resource resource) {
+      Resource resource) throws ResourceHandlerException {
     for (int index = 0; index < numaNodesList.size(); index++) {
       NumaNodeResource numaNode = numaNodesList
           .get((currentAssignNode + index) % numaNodesList.size());
@@ -306,12 +306,20 @@ public class NumaResourceAllocator {
    * Release assigned NUMA resources for the container.
    *
    * @param containerId the container ID
+   * @throws ResourceHandlerException if removing the NUMA resources from the NM state store fails
    */
-  public synchronized void releaseNumaResource(ContainerId containerId) {
+  public synchronized void releaseNumaResource(ContainerId containerId)
+      throws ResourceHandlerException {
     LOG.info("Releasing the assigned NUMA resources for " + containerId);
     for (NumaNodeResource numaNode : numaNodesList) {
       numaNode.releaseResources(containerId);
     }
+    // Also remove the container's NUMA assignment from the NM state store.
+    try {
+      context.getNMStateStore().releaseAssignedResources(containerId, NUMA_RESOURCE_TYPE);
+    } catch (IOException e){
+      throw new ResourceHandlerException(e);
+    }
   }
 
   /**

+ 17 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java

@@ -1813,4 +1813,21 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
             + getCurrentVersion() + ", but loading version " + loadedVersion);
     }
   }
+  @Override
+  public void releaseAssignedResources(ContainerId containerId, String resourceType)
+      throws IOException {
+    LOG.debug("releaseAssignedResources: containerId=" + containerId + " resourceType="
+        + resourceType);
+    try {
+      try (WriteBatch batch = db.createWriteBatch()) {
+        String key = CONTAINERS_KEY_PREFIX + containerId
+            + CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX + resourceType;
+        batch.delete(bytes(key));
+        db.write(batch);
+      }
+    }catch (DBException e){
+      markStoreUnHealthy(e);
+      throw new IOException(e);
+    }
+  }
 }

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java

@@ -786,6 +786,15 @@ public abstract class NMStateStoreService extends AbstractService {
       String resourceType, List<Serializable> assignedResources)
       throws IOException;
 
+  /**
+   * Delete the resources of the given resource type that are assigned to a container.
+   * @param containerId Container Id
+   * @param resourceType resource Type
+   * @throws IOException if releasing the assigned resources fails
+   */
+  public void releaseAssignedResources(ContainerId containerId, String resourceType)
+      throws IOException {}
+
   protected abstract void initStorage(Configuration conf) throws IOException;
 
   protected abstract void startStorage() throws IOException;

+ 12 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java

@@ -1756,6 +1756,18 @@ public class TestNMLeveldbStateStoreService {
     resources = rcs.getResourceMappings().getAssignedResources("numa");
     Assert.assertEquals(numaRes, resources);
     Assert.assertEquals(numaRes, resourceMappings.getAssignedResources("numa"));
+    // test removing numa resources from state store
+    stateStore.releaseAssignedResources(containerId, "numa");
+    recoveredContainers = loadContainersState(stateStore.getContainerStateIterator());
+    resourceMappings = recoveredContainers.get(0).getResourceMappings();
+    assertTrue(resourceMappings.getAssignedResources("numa").isEmpty());
+
+    // Deleting a key that no longer exists must not break anything.
+    try {
+      stateStore.releaseAssignedResources(containerId, "numa");
+    }catch (RuntimeException e){
+      Assert.fail("Should not throw exception while deleting non existing key from statestore");
+    }
   }
 
   @Test