Quellcode durchsuchen

HDFS-10341. Add a metric to expose the timeout number of pending replication blocks. (Contributed by Akira Ajisaka)

Arpit Agarwal vor 9 Jahren
Ursprung
Commit
97e2449477

+ 1 - 0
hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md

@@ -218,6 +218,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
 | `TotalSyncCount` | Total number of sync operations performed by edit log |
 | `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation|
 | `NameDirSize` | NameNode name directories size in bytes |
+| `NumTimedOutPendingReconstructions` | The number of timed out reconstructions. This is not the number of unique blocks that timed out. |
 
 JournalNode
 -----------

+ 4 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -198,6 +198,10 @@ public class BlockManager implements BlockStatsMXBean {
   public int getPendingDataNodeMessageCount() {
     return pendingDNMessages.count();
   }
+  /** Used by metrics; returns pendingReconstruction.getNumTimedOuts(). */
+  public long getNumTimedOutPendingReconstructions() {
+    return pendingReconstruction.getNumTimedOuts();
+  }
 
   /**replicationRecheckInterval is how often namenode checks for new replication work*/
   private final long replicationRecheckInterval;

+ 15 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReconstructionBlocks.java

@@ -50,6 +50,7 @@ class PendingReconstructionBlocks {
   private final ArrayList<BlockInfo> timedOutItems;
   Daemon timerThread = null;
   private volatile boolean fsRunning = true;
+  private long timedOutCount = 0L;
 
   //
   // It might take anywhere between 5 to 10 minutes before
@@ -125,6 +126,7 @@ class PendingReconstructionBlocks {
     synchronized (pendingReconstructions) {
       pendingReconstructions.clear();
       timedOutItems.clear();
+      timedOutCount = 0L;
     }
   }
 
@@ -148,6 +150,16 @@ class PendingReconstructionBlocks {
     return 0;
   }
 
+  /**
+   * Used for metrics. Sums timedOutCount and the current timedOutItems size.
+   * @return The number of timeouts
+   */
+  long getNumTimedOuts() {
+    synchronized (timedOutItems) {
+      return timedOutCount + timedOutItems.size();
+    }
+  }
+
   /**
    * Returns a list of blocks that have timed out their
    * reconstruction requests. Returns null if no blocks have
@@ -158,9 +170,11 @@ class PendingReconstructionBlocks {
       if (timedOutItems.size() <= 0) {
         return null;
       }
+      int size = timedOutItems.size();
       BlockInfo[] blockList = timedOutItems.toArray(
-          new BlockInfo[timedOutItems.size()]);
+          new BlockInfo[size]);
       timedOutItems.clear();
+      timedOutCount += size;
       return blockList;
     }
   }

+ 5 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -4507,6 +4507,11 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   public long getExcessBlocks() {
     return blockManager.getExcessBlocksCount();
   }
+
+  @Metric // Delegates to blockManager for the timed-out reconstruction count.
+  public long getNumTimedOutPendingReconstructions() {
+    return blockManager.getNumTimedOutPendingReconstructions();
+  }
   
   // HA-only metric
   @Metric

+ 14 - 6
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReconstruction.java

@@ -18,6 +18,8 @@
 package org.apache.hadoop.hdfs.server.blockmanagement;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import java.util.ArrayList;
@@ -117,14 +119,15 @@ public class TestPendingReconstruction {
     //
     // verify that nothing has timed out so far
     //
-    assertTrue(pendingReconstructions.getTimedOutBlocks() == null);
+    assertNull(pendingReconstructions.getTimedOutBlocks());
+    assertEquals(0L, pendingReconstructions.getNumTimedOuts());
 
     //
     // Wait for one second and then insert some more items.
     //
     try {
       Thread.sleep(1000);
-    } catch (Exception e) {
+    } catch (Exception ignored) {
     }
 
     for (int i = 10; i < 15; i++) {
@@ -133,7 +136,8 @@ public class TestPendingReconstruction {
           DatanodeStorageInfo.toDatanodeDescriptors(
               DFSTestUtil.createDatanodeStorageInfos(i)));
     }
-    assertTrue(pendingReconstructions.size() == 15);
+    assertEquals(15, pendingReconstructions.size());
+    assertEquals(0L, pendingReconstructions.getNumTimedOuts());
 
     //
     // Wait for everything to timeout.
@@ -153,10 +157,14 @@ public class TestPendingReconstruction {
     // Verify that everything has timed out.
     //
     assertEquals("Size of pendingReconstructions ", 0, pendingReconstructions.size());
+    assertEquals(15L, pendingReconstructions.getNumTimedOuts());
     Block[] timedOut = pendingReconstructions.getTimedOutBlocks();
-    assertTrue(timedOut != null && timedOut.length == 15);
-    for (int i = 0; i < timedOut.length; i++) {
-      assertTrue(timedOut[i].getBlockId() < 15);
+    assertNotNull(timedOut);
+    assertEquals(15, timedOut.length);
+    // Verify the number is not reset
+    assertEquals(15L, pendingReconstructions.getNumTimedOuts());
+    for (Block block : timedOut) {
+      assertTrue(block.getBlockId() < 15);
     }
     pendingReconstructions.stop();
   }