Browse Source

YARN-11463. Node Labels root directory creation doesn't have a retry logic (#5562)

Co-authored-by: Ashutosh Gupta <ashugpt@amazon.com>
Ashutosh Gupta 2 years ago
parent
commit
964c1902c8

+ 10 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -217,6 +217,16 @@ public class YarnConfiguration extends Configuration {
 
 
   public static final int DEFAULT_RM_APPLICATION_MAX_TAG_LENGTH = 100;
   public static final int DEFAULT_RM_APPLICATION_MAX_TAG_LENGTH = 100;
 
 
+  public static final String NODE_STORE_ROOT_DIR_NUM_RETRIES =
+      RM_PREFIX + "nodestore-rootdir.num-retries";
+
+  public static final int NODE_STORE_ROOT_DIR_NUM_DEFAULT_RETRIES = 1000;
+
+  public static final String NODE_STORE_ROOT_DIR_RETRY_INTERVAL =
+      RM_PREFIX + "nodestore-rootdir.retry-interval-ms";
+
+  public static final int NODE_STORE_ROOT_DIR_RETRY_DEFAULT_INTERVAL = 1000;
+
   public static final String RM_APPLICATION_MASTER_SERVICE_PROCESSORS =
   public static final String RM_APPLICATION_MASTER_SERVICE_PROCESSORS =
       RM_PREFIX + "application-master-service.processors";
       RM_PREFIX + "application-master-service.processors";
 
 

+ 26 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/nodelabels/store/AbstractFSNodeStore.java

@@ -65,8 +65,32 @@ public abstract class AbstractFSNodeStore<M> {
     this.fsWorkingPath = fsStorePath;
     this.fsWorkingPath = fsStorePath;
     this.manager = mgr;
     this.manager = mgr;
     initFileSystem(conf);
     initFileSystem(conf);
-    // mkdir of root dir path
-    fs.mkdirs(fsWorkingPath);
+    // mkdir of root dir path with retry logic
+    int maxRetries = conf.getInt(YarnConfiguration.NODE_STORE_ROOT_DIR_NUM_RETRIES,
+        YarnConfiguration.NODE_STORE_ROOT_DIR_NUM_DEFAULT_RETRIES);
+    int retryCount = 0;
+    boolean success = fs.mkdirs(fsWorkingPath);
+
+    while (!success && retryCount < maxRetries) {
+      try {
+        if (!fs.exists(fsWorkingPath)) {
+          success = fs.mkdirs(fsWorkingPath);
+        } else {
+          success = true;
+        }
+      } catch (IOException e) {
+        retryCount++;
+        if (retryCount >= maxRetries) {
+          throw e;
+        }
+        try {
+          Thread.sleep(conf.getInt(YarnConfiguration.NODE_STORE_ROOT_DIR_RETRY_INTERVAL,
+              YarnConfiguration.NODE_STORE_ROOT_DIR_RETRY_DEFAULT_INTERVAL));
+        } catch (InterruptedException ie) {
+          throw new RuntimeException(ie);
+        }
+      }
+    }
     this.replication = conf.getInt(YarnConfiguration.FS_STORE_FILE_REPLICATION,
     this.replication = conf.getInt(YarnConfiguration.FS_STORE_FILE_REPLICATION,
         YarnConfiguration.DEFAULT_FS_STORE_FILE_REPLICATION);
         YarnConfiguration.DEFAULT_FS_STORE_FILE_REPLICATION);
     LOG.info("Created store directory :" + fsWorkingPath);
     LOG.info("Created store directory :" + fsWorkingPath);

+ 16 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -5177,4 +5177,20 @@
     <value>1</value>
     <value>1</value>
   </property>
   </property>
 
 
+  <property>
+    <description>
+      Number of Retries while trying to make root directory for node store.
+    </description>
+    <name>yarn.resourcemanager.nodestore-rootdir.num-retries</name>
+    <value>1000</value>
+  </property>
+
+  <property>
+    <description>
+      Interval in ms between retries while trying to make root directory for node store.
+    </description>
+    <name>yarn.resourcemanager.nodestore-rootdir.retry-interval-ms</name>
+    <value>1000</value>
+  </property>
+
 </configuration>
 </configuration>

+ 0 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/nodelabels/TestFileSystemNodeLabelsStore.java

@@ -359,9 +359,6 @@ public class TestFileSystemNodeLabelsStore extends NodeLabelTestBase {
 
 
     mockStore.setFs(mockFs);
     mockStore.setFs(mockFs);
     verifyMkdirsCount(mockStore, true, 1);
     verifyMkdirsCount(mockStore, true, 1);
-    verifyMkdirsCount(mockStore, false, 2);
-    verifyMkdirsCount(mockStore, true, 3);
-    verifyMkdirsCount(mockStore, false, 4);
   }
   }
 
 
   private void verifyMkdirsCount(FileSystemNodeLabelsStore store,
   private void verifyMkdirsCount(FileSystemNodeLabelsStore store,