
HDFS-16841. Enhance the function of DebugAdmin#VerifyECCommand (#5137)

huhaiyang · 2 years ago · commit ef84d21867

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DebugAdmin.java (+40, -17)

@@ -432,8 +432,13 @@ public class DebugAdmin extends Configured implements Tool {
 
     VerifyECCommand() {
       super("verifyEC",
-          "verifyEC -file <file>",
-          "  Verify HDFS erasure coding on all block groups of the file.");
+          "verifyEC -file <file> [-blockId <blk_Id>] [-skipFailureBlocks]",
+          "  -file Verify HDFS erasure coding on all block groups of the file." +
+              System.lineSeparator() +
+          "  -skipFailureBlocks specify will skip any block group failures during verify," +
+          "  and continues verify all block groups of the file," + System.lineSeparator() +
+          "  the default is not to skip failure blocks." + System.lineSeparator() +
+          "  -blockId specify blk_Id to verify for a specific one block group.");
     }
 
     int run(List<String> args) throws IOException {
@@ -480,30 +485,48 @@ public class DebugAdmin extends Configured implements Tool {
       this.parityBlkNum = ecPolicy.getNumParityUnits();
       this.cellSize = ecPolicy.getCellSize();
       this.encoder = CodecUtil.createRawEncoder(getConf(), ecPolicy.getCodecName(),
-          new ErasureCoderOptions(
-              ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits()));
+          new ErasureCoderOptions(dataBlkNum, parityBlkNum));
       int blockNum = dataBlkNum + parityBlkNum;
       this.readService = new ExecutorCompletionService<>(
           DFSUtilClient.getThreadPoolExecutor(blockNum, blockNum, 60,
               new LinkedBlockingQueue<>(), "read-", false));
-      this.blockReaders = new BlockReader[dataBlkNum + parityBlkNum];
+      this.blockReaders = new BlockReader[blockNum];
+
+      String needToVerifyBlockId = StringUtils.popOptionWithArgument("-blockId", args);
+      boolean skipFailureBlocks = StringUtils.popOption("-skipFailureBlocks", args);
+      boolean isHealthy = true;
 
       for (LocatedBlock locatedBlock : locatedBlocks.getLocatedBlocks()) {
-        System.out.println("Checking EC block group: blk_" + locatedBlock.getBlock().getBlockId());
-        LocatedStripedBlock blockGroup = (LocatedStripedBlock) locatedBlock;
+        String blockName = locatedBlock.getBlock().getBlockName();
+        if (needToVerifyBlockId == null || needToVerifyBlockId.equals(blockName)) {
+          System.out.println("Checking EC block group: " + blockName);
+          LocatedStripedBlock blockGroup = (LocatedStripedBlock) locatedBlock;
 
-        try {
-          verifyBlockGroup(blockGroup);
-          System.out.println("Status: OK");
-        } catch (Exception e) {
-          System.err.println("Status: ERROR, message: " + e.getMessage());
-          return 1;
-        } finally {
-          closeBlockReaders();
+          try {
+            verifyBlockGroup(blockGroup);
+            System.out.println("Status: OK");
+          } catch (Exception e) {
+            System.err.println("Status: ERROR, message: " + e.getMessage());
+            isHealthy = false;
+            if (!skipFailureBlocks) {
+              break;
+            }
+          } finally {
+            closeBlockReaders();
+          }
+
+          if (needToVerifyBlockId != null) {
+            break;
+          }
         }
       }
-      System.out.println("\nAll EC block group status: OK");
-      return 0;
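+      // Print the overall summary only when every checked block group passed and
+      // no specific block group was requested; any failure is reported via exit code 1.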
+      if (isHealthy) {
+        if (needToVerifyBlockId == null) {
+          System.out.println("\nAll EC block group status: OK");
+        }
+        return 0;
+      }
+      return 1;
     }
 
     private void verifyBlockGroup(LocatedStripedBlock blockGroup) throws Exception {
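With this change, verifyEC takes two optional flags alongside -file. A minimal usage sketch (the path and block ID below are illustrative, not taken from this patch):

    # Verify every block group of an EC file (previous behaviour).
    hdfs debug verifyEC -file /ec/foo
    # Verify only the block group with the given ID.
    hdfs debug verifyEC -file /ec/foo -blockId blk_-9223372036854775776
    # Keep going past failed block groups and report each one.
    hdfs debug verifyEC -file /ec/foo -skipFailureBlocks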

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDebugAdmin.java (+42, -2)

@@ -194,8 +194,13 @@ public class TestDebugAdmin {
     cluster.waitActive();
     DistributedFileSystem fs = cluster.getFileSystem();
 
-    assertEquals("ret: 1, verifyEC -file <file>  Verify HDFS erasure coding on " +
-        "all block groups of the file.", runCmd(new String[]{"verifyEC"}));
+    assertEquals("ret: 1, verifyEC -file <file> [-blockId <blk_Id>] " +
+        "[-skipFailureBlocks]  -file Verify HDFS erasure coding on all block groups of the file." +
+        "  -skipFailureBlocks specify will skip any block group failures during verify," +
+        "  and continues verify all block groups of the file," +
+        "  the default is not to skip failure blocks." +
+        "  -blockId specify blk_Id to verify for a specific one block group.",
+        runCmd(new String[]{"verifyEC"}));
 
     assertEquals("ret: 1, File /bar does not exist.",
         runCmd(new String[]{"verifyEC", "-file", "/bar"}));
@@ -270,6 +275,41 @@ public class TestDebugAdmin {
         "-out", metaFile.getAbsolutePath()});
     assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_corrupt"})
         .contains("Status: ERROR, message: EC compute result not match."));
+
+    // Specify -blockId.
+    Path newFile = new Path(ecDir, "foo_new");
+    DFSTestUtil.createFile(fs, newFile, (int) k, 6 * m, m, repl, seed);
+    blocks = DFSTestUtil.getAllBlocks(fs, newFile);
+    assertEquals(2, blocks.size());
+    blockGroup = (LocatedStripedBlock) blocks.get(0);
+    String blockName = blockGroup.getBlock().getBlockName();
+    assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_new", "-blockId", blockName})
+        .contains("ret: 0, Checking EC block group: " + blockName + "Status: OK"));
+
+    // Specify -skipFailureBlocks.
+    indexedBlocks = StripedBlockUtil.parseStripedBlockGroup(blockGroup,
+        ecPolicy.getCellSize(), ecPolicy.getNumDataUnits(), ecPolicy.getNumParityUnits());
+    // Try corrupt block 0 in block group.
+    toCorruptLocatedBlock = indexedBlocks[0];
+    toCorruptBlock = toCorruptLocatedBlock.getBlock();
+    datanode = cluster.getDataNode(toCorruptLocatedBlock.getLocations()[0].getIpcPort());
+    blockFile = getBlockFile(datanode.getFSDataset(),
+        toCorruptBlock.getBlockPoolId(), toCorruptBlock.getLocalBlock());
+    metaFile = getMetaFile(datanode.getFSDataset(),
+        toCorruptBlock.getBlockPoolId(), toCorruptBlock.getLocalBlock());
+    metaFile.delete();
+    // Write error bytes to block file and re-generate meta checksum.
+    errorBytes = new byte[1048576];
+    new Random(0x12345678L).nextBytes(errorBytes);
+    FileUtils.writeByteArrayToFile(blockFile, errorBytes);
+    runCmd(new String[]{"computeMeta", "-block", blockFile.getAbsolutePath(),
+        "-out", metaFile.getAbsolutePath()});
+    // Run verifyEC with -skipFailureBlocks.
+    LocatedStripedBlock blockGroup2 = (LocatedStripedBlock) blocks.get(1);
+    assertTrue(runCmd(new String[]{"verifyEC", "-file", "/ec/foo_new", "-skipFailureBlocks"})
+        .contains("ret: 1, Checking EC block group: " + blockGroup.getBlock().getBlockName() +
+            "Status: ERROR, message: EC compute result not match." +
+            "Checking EC block group: " + blockGroup2.getBlock().getBlockName() + "Status: OK"));
   }
 
 }
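Per the final assertion, running verifyEC with -skipFailureBlocks against a file whose first block group was corrupted reports each block group instead of stopping at the first failure, and still exits with code 1 (block IDs below are illustrative):

    Checking EC block group: blk_-9223372036854775792
    Status: ERROR, message: EC compute result not match.
    Checking EC block group: blk_-9223372036854775776
    Status: OK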