
HDFS-15795. EC: Wrong checksum when reconstruction was failed by exception. Contributed by Yushi Hayasaka (#2657)

crossfire committed 18978f2e20 · 4 years ago

+ 5 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockChecksumHelper.java

@@ -480,8 +480,9 @@ final class BlockChecksumHelper {
         // Before populating the blockChecksum at this index, record the byte
         // offset where it will begin.
         blockChecksumPositions[idx] = blockChecksumBuf.getLength();
+        ExtendedBlock block = null;
         try {
-          ExtendedBlock block = getInternalBlock(numDataUnits, idx);
+          block = getInternalBlock(numDataUnits, idx);
 
           LiveBlockInfo liveBlkInfo = liveDns.get((byte) idx);
           if (liveBlkInfo == null) {
@@ -502,7 +503,9 @@ final class BlockChecksumHelper {
             break; // done with the computation, simply return.
           }
         } catch (IOException e) {
-          LOG.warn("Failed to get the checksum", e);
+          LOG.warn("Failed to get the checksum for block {} at index {} "
+              + "in blockGroup {}", block, idx, blockGroup, e);
+          throw e;
         }
       }
 
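Why rethrowing matters: before this change the IOException was swallowed and the loop simply moved on to the next block index, so the DataNode could return a checksum computed over fewer internal blocks instead of reporting an error. A minimal standalone sketch of the difference, with hypothetical names (not the actual helper code):

    import java.io.IOException;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;

    public class ChecksumAggregationSketch {
      static byte[] aggregate(byte[][] perBlockChecksums, boolean rethrow)
          throws IOException, NoSuchAlgorithmException {
        MessageDigest digest = MessageDigest.getInstance("MD5");
        for (byte[] checksum : perBlockChecksums) {
          try {
            if (checksum == null) {
              // stands in for getInternalBlock()/reconstruction throwing
              throw new IOException("checksum reconstruction failed");
            }
            digest.update(checksum);
          } catch (IOException e) {
            if (rethrow) {
              throw e; // after this patch: the caller sees the failure
            }
            // before this patch: swallowed, so the final digest silently
            // covers fewer blocks -- a wrong checksum, not an error
          }
        }
        return digest.digest();
      }
    }

With the rethrow in place the RPC fails outright, and the client can retry the checksum computation against another DataNode, which is exactly what the new test below exercises.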

+ 6 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java

@@ -106,6 +106,12 @@ public class DataNodeFaultInjector {
    */
   public void stripedBlockReconstruction() throws IOException {}
 
+  /**
+   * Used as a hook to inject failure in erasure coding checksum reconstruction
+   * process.
+   */
+  public void stripedBlockChecksumReconstruction() throws IOException {}
+
   /**
    * Used as a hook to inject latency when read block
    * in erasure coding reconstruction process.
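The test added below installs this hook with Mockito; for reference, here is a minimal sketch of driving the same injection point without Mockito, assuming only the DataNodeFaultInjector.get()/set() accessors that the test itself uses:

    DataNodeFaultInjector oldInjector = DataNodeFaultInjector.get();
    DataNodeFaultInjector.set(new DataNodeFaultInjector() {
      private boolean firstCall = true;
      @Override
      public void stripedBlockChecksumReconstruction() throws IOException {
        if (firstCall) {
          firstCall = false; // fail only the first reconstruction attempt
          throw new IOException("injected checksum reconstruction failure");
        }
      }
    });
    try {
      // ... run the checksum computation that triggers reconstruction ...
    } finally {
      DataNodeFaultInjector.set(oldInjector); // always restore the default
    }

This mirrors the test's doThrow(new IOException()).doNothing() stubbing: the hook throws only on the first call, modeling a transient failure, so the retried computation succeeds and the two checksums can be compared.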

+ 3 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java

@@ -23,6 +23,7 @@ import java.nio.ByteBuffer;
 import java.util.Arrays;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
 import org.apache.hadoop.io.DataOutputBuffer;
 
 /**
@@ -75,6 +76,7 @@ public abstract class StripedBlockChecksumReconstructor
     prepareDigester();
     long maxTargetLength = getMaxTargetLength();
     while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) {
+      DataNodeFaultInjector.get().stripedBlockChecksumReconstruction();
       long remaining = maxTargetLength - getPositionInBlock();
       final int toReconstructLen = (int) Math
           .min(getStripedReader().getBufferSize(), remaining);
@@ -225,4 +227,4 @@ public abstract class StripedBlockChecksumReconstructor
     getStripedReader().close();
     cleanup();
   }
-}
+}
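The injection point sits at the top of the per-stripe loop, so a thrown IOException aborts the whole checksum-reconstruction task before the current stripe is digested. A condensed sketch of the loop's shape, with names taken from the hunk above and elided parts marked:

    while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) {
      // new hook: a test-installed injector may throw here, failing the
      // task instead of letting it produce a partial digest
      DataNodeFaultInjector.get().stripedBlockChecksumReconstruction();
      long remaining = maxTargetLength - getPositionInBlock();
      final int toReconstructLen = (int) Math
          .min(getStripedReader().getBufferSize(), remaining);
      // ... reconstruct toReconstructLen bytes and update the digest ...
    }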

+ 34 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java

@@ -28,6 +28,7 @@ import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.After;
 import org.junit.Assert;
@@ -46,6 +47,8 @@ import java.io.IOException;
 import java.util.Random;
 
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
 
 /**
  * This test serves a prototype to demo the idea proposed so far. It creates two
@@ -517,6 +520,37 @@ public class TestFileChecksum {
         bytesPerCRC - 1);
   }
 
+  @Test(timeout = 90000)
+  public void testStripedFileChecksumWithReconstructFail()
+      throws Exception {
+    String stripedFile4 = ecDir + "/stripedFileChecksum4";
+    prepareTestFiles(fileSize, new String[] {stripedFile4});
+
+    // get checksum
+    FileChecksum fileChecksum = getFileChecksum(stripedFile4, -1, false);
+
+    DataNodeFaultInjector oldInjector = DataNodeFaultInjector.get();
+    DataNodeFaultInjector newInjector = mock(DataNodeFaultInjector.class);
+    doThrow(new IOException())
+        .doNothing()
+        .when(newInjector)
+        .stripedBlockChecksumReconstruction();
+    DataNodeFaultInjector.set(newInjector);
+
+    try {
+      // Get checksum again with reconstruction.
+      // If the reconstruction task fails, the client tries to get the
+      // checksum from another DN that holds a block of the block group,
+      // since no result was returned from the first DN.
+      FileChecksum fileChecksum1 = getFileChecksum(stripedFile4, -1, true);
+
+      Assert.assertEquals("checksums should be equal", fileChecksum,
+          fileChecksum1);
+    } finally {
+      DataNodeFaultInjector.set(oldInjector);
+    }
+  }
+
   @Test(timeout = 90000)
   public void testMixedBytesPerChecksum() throws Exception {
     int fileLength = bytesPerCRC * 3;