Parcourir la source

HDFS-4699. TestPipelinesFailover#testPipelineRecoveryStress fails sporadically. Contributed by Chris Nauroth.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1469839 13f79535-47bb-0310-9956-ffa450edef68
Kihwal Lee il y a 12 ans
Parent
commit
16cc4a6e86

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -2560,6 +2560,9 @@ Release 0.23.8 - UNRELEASED
 
     HDFS-4477. Secondary namenode may retain old tokens (daryn via kihwal)
 
+    HDFS-4699. TestPipelinesFailover#testPipelineRecoveryStress fails
+    sporadically (Chris Nauroth via kihwal)
+
 Release 0.23.7 - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 4 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java

@@ -1286,7 +1286,10 @@ public class DataNode extends Configured
     LOG.warn("checkDiskError: exception: ", e);
     if (e instanceof SocketException || e instanceof SocketTimeoutException
     	  || e instanceof ClosedByInterruptException 
-    	  || e.getMessage().startsWith("Broken pipe")) {
+    	  || e.getMessage().startsWith("An established connection was aborted")
+    	  || e.getMessage().startsWith("Broken pipe")
+    	  || e.getMessage().startsWith("Connection reset")
+    	  || e.getMessage().contains("java.nio.channels.SocketChannel")) {
       LOG.info("Not checking disk as checkDiskError was called on a network" +
       		" related exception");	
       return;

+ 8 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java

@@ -422,6 +422,11 @@ public class TestPipelinesFailover {
     // Disable permissions so that another user can recover the lease.
     harness.conf.setBoolean(
         DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
+    // This test triggers rapid NN failovers.  The client retry policy uses an
+    // exponential backoff.  This can quickly lead to long sleep times and even
+    // time out the whole test.  Cap the sleep time at 1s to prevent this.
+    harness.conf.setInt(DFSConfigKeys.DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY,
+      1000);
 
     final MiniDFSCluster cluster = harness.startCluster();
     try {
@@ -537,11 +542,10 @@ public class TestPipelinesFailover {
   }
   
   /**
-   * Try to cover the lease on the given file for up to 30
-   * seconds.
+   * Try to recover the lease on the given file for up to 60 seconds.
    * @param fsOtherUser the filesystem to use for the recoverLease call
    * @param testPath the path on which to run lease recovery
-   * @throws TimeoutException if lease recover does not succeed within 30
+   * @throws TimeoutException if lease recovery does not succeed within 60
    * seconds
    * @throws InterruptedException if the thread is interrupted
    */
@@ -564,7 +568,7 @@ public class TestPipelinesFailover {
           }
           return success;
         }
-      }, 1000, 30000);
+      }, 1000, 60000);
     } catch (TimeoutException e) {
       throw new TimeoutException("Timed out recovering lease for " +
           testPath);