Browse Source

HDFS-13603: do not propagate ExecutionException and add maxRetries limit to NameNode edek cache warmup (#6774)

Yu Zhang 10 tháng trước cách đây
mục cha
commit
b4ddb2d3bb

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -1422,6 +1422,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final int DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT = 1000;
   public static final String DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_KEY = "dfs.namenode.edekcacheloader.initial.delay.ms";
   public static final int DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_DEFAULT = 3000;
+  public static final String DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_KEY =
+      "dfs.namenode.edekcacheloader.max-retries";
+  public static final int DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_DEFAULT = 10;
   public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_KEY = "dfs.namenode.reencrypt.sleep.interval";
   public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_DEFAULT = "1m";
   public static final String DFS_NAMENODE_REENCRYPT_BATCH_SIZE_KEY = "dfs.namenode.reencrypt.batch.size";

+ 25 - 25
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirEncryptionZoneOp.java

@@ -533,16 +533,16 @@ final class FSDirEncryptionZoneOp {
   }
 
   /**
-   * Proactively warm up the edek cache. We'll get all the edek key names,
-   * then launch up a separate thread to warm them up.
+   * Best-effort attempt to proactively warm up the edek cache. We'll get all the edek key names,
+   * then launch up a separate thread to warm them up. Retries happen if any of keys fail to warm up.
    */
   static void warmUpEdekCache(final ExecutorService executor,
-      final FSDirectory fsd, final int delay, final int interval) {
+      final FSDirectory fsd, final int delay, final int interval, final int maxRetries) {
     fsd.readLock();
     try {
       String[] edeks  = fsd.ezManager.getKeyNames();
       executor.execute(
-          new EDEKCacheLoader(edeks, fsd.getProvider(), delay, interval));
+          new EDEKCacheLoader(edeks, fsd.getProvider(), delay, interval, maxRetries));
     } finally {
       fsd.readUnlock();
     }
@@ -557,19 +557,22 @@ final class FSDirEncryptionZoneOp {
     private final KeyProviderCryptoExtension kp;
     private int initialDelay;
     private int retryInterval;
+    private int maxRetries;
 
     EDEKCacheLoader(final String[] names, final KeyProviderCryptoExtension kp,
-        final int delay, final int interval) {
+        final int delay, final int interval, final int maxRetries) {
       this.keyNames = names;
       this.kp = kp;
       this.initialDelay = delay;
       this.retryInterval = interval;
+      this.maxRetries = maxRetries;
     }
 
     @Override
     public void run() {
       NameNode.LOG.info("Warming up {} EDEKs... (initialDelay={}, "
-          + "retryInterval={})", keyNames.length, initialDelay, retryInterval);
+              + "retryInterval={}, maxRetries={})", keyNames.length, initialDelay, retryInterval,
+          maxRetries);
       try {
         Thread.sleep(initialDelay);
       } catch (InterruptedException ie) {
@@ -577,42 +580,39 @@ final class FSDirEncryptionZoneOp {
         return;
       }
 
-      final int logCoolDown = 10000; // periodically print error log (if any)
-      int sinceLastLog = logCoolDown; // always print the first failure
       boolean success = false;
+      int retryCount = 0;
       IOException lastSeenIOE = null;
       long warmUpEDEKStartTime = monotonicNow();
-      while (true) {
+
+      while (!success && retryCount < maxRetries) {
         try {
           kp.warmUpEncryptedKeys(keyNames);
-          NameNode.LOG
-              .info("Successfully warmed up {} EDEKs.", keyNames.length);
+          NameNode.LOG.info("Successfully warmed up {} EDEKs.", keyNames.length);
           success = true;
-          break;
         } catch (IOException ioe) {
           lastSeenIOE = ioe;
-          if (sinceLastLog >= logCoolDown) {
-            NameNode.LOG.info("Failed to warm up EDEKs.", ioe);
-            sinceLastLog = 0;
-          } else {
-            NameNode.LOG.debug("Failed to warm up EDEKs.", ioe);
-          }
+          NameNode.LOG.info("Failed to warm up EDEKs.", ioe);
         } catch (Exception e) {
           NameNode.LOG.error("Cannot warm up EDEKs.", e);
           throw e;
         }
-        try {
-          Thread.sleep(retryInterval);
-        } catch (InterruptedException ie) {
-          NameNode.LOG.info("EDEKCacheLoader interrupted during retry.");
-          break;
+
+        if (!success) {
+          try {
+            Thread.sleep(retryInterval);
+          } catch (InterruptedException ie) {
+            NameNode.LOG.info("EDEKCacheLoader interrupted during retry.");
+            break;
+          }
+          retryCount++;
         }
-        sinceLastLog += retryInterval;
       }
+
       long warmUpEDEKTime = monotonicNow() - warmUpEDEKStartTime;
       NameNode.getNameNodeMetrics().addWarmUpEDEKTime(warmUpEDEKTime);
       if (!success) {
-        NameNode.LOG.warn("Unable to warm up EDEKs.");
+        NameNode.LOG.warn("Max retry {} reached, unable to warm up EDEKs.", maxRetries);
         if (lastSeenIOE != null) {
           NameNode.LOG.warn("Last seen exception:", lastSeenIOE);
         }

+ 7 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -579,6 +579,7 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   private ExecutorService edekCacheLoader = null;
   private final int edekCacheLoaderDelay;
   private final int edekCacheLoaderInterval;
+  private final int edekCacheLoaderMaxRetries;
 
   /**
    * When an active namenode will roll its own edit log, in # edits
@@ -1012,6 +1013,9 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
       this.edekCacheLoaderInterval = conf.getInt(
           DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_KEY,
           DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT);
+      this.edekCacheLoaderMaxRetries = conf.getInt(
+          DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_KEY,
+          DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_DEFAULT);
 
       this.leaseRecheckIntervalMs = conf.getLong(
           DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY,
@@ -1470,8 +1474,9 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
             new ThreadFactoryBuilder().setDaemon(true)
                 .setNameFormat("Warm Up EDEK Cache Thread #%d")
                 .build());
-        FSDirEncryptionZoneOp.warmUpEdekCache(edekCacheLoader, dir,
-            edekCacheLoaderDelay, edekCacheLoaderInterval);
+        FSDirEncryptionZoneOp
+            .warmUpEdekCache(edekCacheLoader, dir, edekCacheLoaderDelay, edekCacheLoaderInterval,
+                edekCacheLoaderMaxRetries);
       }
       if (blockManager.getSPSManager() != null) {
         blockManager.getSPSManager().start();

+ 8 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

@@ -3614,6 +3614,14 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.namenode.edekcacheloader.max-retries</name>
+  <value>10</value>
+  <description>When KeyProvider is configured, the max retries allowed to attempt
+    warm up edek cache if none of key successful on NN start up / become active.
+  </description>
+</property>
+
 <property>
   <name>dfs.namenode.reencrypt.sleep.interval</name>
   <value>1m</value>

+ 59 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSDirEncryptionZoneOp.java

@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
+
+import org.junit.Test;
+
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+public class TestFSDirEncryptionZoneOp {
+
+  @Test
+  public void testWarmUpEdekCacheRetries() throws IOException {
+    NameNode.initMetrics(new Configuration(), NamenodeRole.NAMENODE);
+
+    final int initialDelay = 100;
+    final int retryInterval = 100;
+    final int maxRetries = 2;
+
+    KeyProviderCryptoExtension kpMock = mock(KeyProviderCryptoExtension.class);
+
+    doThrow(new IOException())
+        .doThrow(new IOException())
+        .doAnswer(invocation -> null)
+        .when(kpMock).warmUpEncryptedKeys(any());
+
+    FSDirEncryptionZoneOp.EDEKCacheLoader loader =
+        new FSDirEncryptionZoneOp.EDEKCacheLoader(new String[] {"edek1", "edek2"}, kpMock,
+            initialDelay, retryInterval, maxRetries);
+
+    loader.run();
+
+    verify(kpMock, times(maxRetries)).warmUpEncryptedKeys(any());
+  }
+}