浏览代码

HDFS-7916. 'reportBadBlocks' from datanodes to standby Node BPServiceActor goes for infinite loop. Contributed by Rushabh Shah.

Kihwal Lee 10 年之前
父节点
当前提交
ea11590aad

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -814,6 +814,9 @@ Release 2.7.1 - UNRELEASED
     HDFS-8254. Standby namenode doesn't process DELETED_BLOCK if the add block
     request is in edit log. (Rushabh S Shah via kihwal)
 
+    HDFS-7916. 'reportBadBlocks' from datanodes to standby Node BPServiceActor
+    goes for infinite loop (Rushabh S Shah  via kihwal)
+
 Release 2.7.0 - 2015-04-20
 
   INCOMPATIBLE CHANGES

+ 4 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ErrorReportAction.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 
 import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.ipc.RemoteException;
 
 
 /**
@@ -43,6 +44,9 @@ public class ErrorReportAction implements BPServiceActorAction {
     DatanodeRegistration bpRegistration) throws BPServiceActorActionException {
     try {
       bpNamenode.errorReport(bpRegistration, errorCode, errorMessage);
+    } catch (RemoteException re) {
+      DataNode.LOG.info("trySendErrorReport encountered RemoteException  "
+          + "errorMessage: " + errorMessage + "  errorCode: " + errorCode, re);
     } catch(IOException e) {
       throw new BPServiceActorActionException("Error reporting "
           + "an error to namenode: ");

+ 4 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReportBadBlockAction.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.ipc.RemoteException;
 
 /**
  * ReportBadBlockAction is an instruction issued by {{BPOfferService}} to
@@ -59,6 +60,9 @@ public class ReportBadBlockAction implements BPServiceActorAction {
 
     try {
       bpNamenode.reportBadBlocks(locatedBlock);
+    } catch (RemoteException re) {
+      DataNode.LOG.info("reportBadBlock encountered RemoteException for "
+          + "block:  " + block , re);
     } catch (IOException e) {
       throw new BPServiceActorActionException("Failed to report bad block "
           + block + " to namenode: ");

+ 54 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java

@@ -55,6 +55,9 @@ import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
 import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
+import org.apache.hadoop.ipc.RemoteException;
+import org.apache.hadoop.ipc.StandbyException;
+import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcErrorCodeProto;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.test.PathUtils;
 import org.apache.hadoop.util.Time;
@@ -621,4 +624,55 @@ public class TestBPOfferService {
       bpos.stop();
     }
   } 
+
+  /**
+   * This test case doesn't add the reportBadBlock request to
+   * {@link BPServiceActor#bpThreadEnqueue} when the Standby namenode throws
+   * {@link StandbyException}
+   * @throws Exception
+   */
+ @Test
+  public void testReportBadBlocksWhenNNThrowsStandbyException()
+      throws Exception {
+    BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2);
+    bpos.start();
+    try {
+      waitForInitialization(bpos);
+      // Should start with neither NN as active.
+      assertNull(bpos.getActiveNN());
+      // Have NN1 claim active at txid 1
+      mockHaStatuses[0] = new NNHAStatusHeartbeat(HAServiceState.ACTIVE, 1);
+      bpos.triggerHeartbeatForTests();
+      // Now mockNN1 is acting like active namenode and mockNN2 as Standby
+      assertSame(mockNN1, bpos.getActiveNN());
+      // Return nothing when active Active Namenode calls reportBadBlocks
+      Mockito.doNothing().when(mockNN1).reportBadBlocks
+          (Mockito.any(LocatedBlock[].class));
+
+      RemoteException re = new RemoteException(StandbyException.class.
+          getName(), "Operation category WRITE is not supported in state "
+          + "standby", RpcErrorCodeProto.ERROR_APPLICATION);
+      // Return StandbyException wrapped in RemoteException when Standby NN
+      // calls reportBadBlocks
+      Mockito.doThrow(re).when(mockNN2).reportBadBlocks
+          (Mockito.any(LocatedBlock[].class));
+
+      bpos.reportBadBlocks(FAKE_BLOCK, mockFSDataset.getVolume(FAKE_BLOCK)
+          .getStorageID(), mockFSDataset.getVolume(FAKE_BLOCK)
+          .getStorageType());
+      // Send heartbeat so that the BpServiceActor can report bad block to
+      // namenode
+      bpos.triggerHeartbeatForTests();
+      Mockito.verify(mockNN2, Mockito.times(1))
+      .reportBadBlocks(Mockito.any(LocatedBlock[].class));
+
+      // Trigger another heartbeat, this will send reportBadBlock again if it
+      // is present in the queue.
+      bpos.triggerHeartbeatForTests();
+      Mockito.verify(mockNN2, Mockito.times(1))
+          .reportBadBlocks(Mockito.any(LocatedBlock[].class));
+    } finally {
+      bpos.stop();
+    }
+  }
 }