
Merge r1455389 through r1457712 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-2802@1457716 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze committed a5a66330a8, 12 years ago
97 changed files with 1972 additions and 523 deletions
  1. dev-support/findHangingTest.sh  (+40 -0)
  2. dev-support/test-patch.sh  (+2 -2)
  3. hadoop-client/pom.xml  (+77 -0)
  4. hadoop-common-project/hadoop-annotations/pom.xml  (+20 -0)
  5. hadoop-common-project/hadoop-common/CHANGES.txt  (+37 -3)
  6. hadoop-common-project/hadoop-common/pom.xml  (+4 -0)
  7. hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CommandWithDestination.java  (+7 -2)
  8. hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/SignalLogger.java  (+93 -0)
  9. hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java  (+8 -0)
  10. hadoop-common-project/hadoop-common/src/main/resources/core-default.xml  (+1 -1)
  11. hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java  (+68 -61)
  12. hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetUtils.java  (+1 -1)
  13. hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/GenericTestUtils.java  (+13 -0)
  14. hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestSignalLogger.java  (+42 -0)
  15. hadoop-dist/pom.xml  (+1 -1)
  16. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/lib/server/Server.java  (+1 -1)
  17. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/client/BaseTestHttpFSWith.java  (+1 -1)
  18. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/client/TestHttpFSFileSystemLocalFileSystem.java  (+30 -15)
  19. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/lib/server/TestServer.java  (+24 -10)
  20. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/lib/servlet/TestHostnameFilter.java  (+4 -1)
  21. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/test/TestDirHelper.java  (+1 -1)
  22. hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/test/TestHdfsHelper.java  (+2 -1)
  23. hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt  (+44 -0)
  24. hadoop-hdfs-project/hadoop-hdfs/src/contrib/libwebhdfs/CMakeLists.txt  (+3 -0)
  25. hadoop-hdfs-project/hadoop-hdfs/src/contrib/libwebhdfs/src/test_libwebhdfs_ops.c  (+6 -5)
  26. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderLocal.java  (+27 -13)
  27. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java  (+14 -6)
  28. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java  (+10 -6)
  29. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java  (+3 -2)
  30. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNode.java  (+2 -14)
  31. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java  (+1 -0)
  32. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java  (+3 -1)
  33. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java  (+22 -3)
  34. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointFaultInjector.java  (+1 -0)
  35. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java  (+52 -31)
  36. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java  (+2 -2)
  37. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java  (+6 -2)
  38. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java  (+3 -1)
  39. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java  (+19 -11)
  40. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java  (+67 -8)
  41. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java  (+0 -6)
  42. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java  (+4 -0)
  43. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java  (+11 -3)
  44. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/LightWeightGSet.java  (+1 -1)
  45. hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java  (+43 -11)
  46. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/LogVerificationAppender.java  (+64 -0)
  47. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgradeFromImage.java  (+10 -1)
  48. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestHftpURLTimeouts.java  (+3 -1)
  49. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java  (+5 -4)
  50. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestShortCircuitLocalRead.java  (+108 -56)
  51. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournalNode.java  (+20 -10)
  52. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java  (+1 -1)
  53. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java  (+2 -24)
  54. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataDirs.java  (+12 -19)
  55. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java  (+7 -7)
  56. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java  (+55 -1)
  57. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java  (+1 -1)
  58. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestHostsFiles.java  (+4 -3)
  59. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestProcessCorruptBlocks.java  (+9 -4)
  60. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java  (+21 -21)
  61. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java  (+83 -32)
  62. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java  (+51 -0)
  63. hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java  (+1 -0)
  64. hadoop-mapreduce-project/CHANGES.txt  (+11 -0)
  65. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/YarnChild.java  (+9 -1)
  66. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java  (+16 -3)
  67. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/JobImpl.java  (+7 -7)
  68. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java  (+13 -2)
  69. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/webapp/CountersBlock.java  (+1 -1)
  70. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/MRApp.java  (+4 -0)
  71. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestRecovery.java  (+123 -0)
  72. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestJobImpl.java  (+1 -1)
  73. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java  (+17 -1)
  74. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobSubmitter.java  (+20 -0)
  75. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/security/TokenCache.java  (+12 -1)
  76. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Fetcher.java  (+9 -18)
  77. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Shuffle.java  (+1 -1)
  78. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/task/reduce/TestFetcher.java  (+49 -0)
  79. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/webapp/HsTasksBlock.java  (+6 -2)
  80. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/webapp/HsTasksPage.java  (+4 -0)
  81. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/MockHistoryJobs.java  (+16 -7)
  82. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/webapp/TestHsWebServicesJobs.java  (+7 -5)
  83. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/webapp/TestHsWebServicesJobsQuery.java  (+2 -2)
  84. hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/pipes/TestPipeApplication.java  (+4 -3)
  85. hadoop-maven-plugins/src/main/java/org/apache/hadoop/maven/plugin/protoc/ProtocMojo.java  (+2 -2)
  86. hadoop-project/pom.xml  (+1 -10)
  87. hadoop-tools/hadoop-gridmix/src/test/java/org/apache/hadoop/mapred/gridmix/TestGridmixSummary.java  (+3 -3)
  88. hadoop-yarn-project/CHANGES.txt  (+11 -0)
  89. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java  (+14 -0)
  90. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/view/JQueryUI.java  (+14 -2)
  91. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml  (+14 -0)
  92. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java  (+1 -1)
  93. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java  (+75 -7)
  94. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NavBlock.java  (+19 -3)
  95. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NodePage.java  (+1 -1)
  96. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java  (+97 -0)
  97. hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/test/java/org/apache/hadoop/yarn/server/webproxy/amfilter/TestAmFilter.java  (+110 -27)

+ 40 - 0
dev-support/findHangingTest.sh

@@ -0,0 +1,40 @@
+#!/bin/bash
+##
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##
+# script to find hanging test from Jenkins build output
+# usage: ./findHangingTest.sh <url of Jenkins build console>
+#
+`curl -k -o jenkins.out "$1"`
+expecting=Running
+cat jenkins.out | while read line; do
+ if [[ "$line" =~ "Running org.apache.hadoop" ]]; then
+  if [[ "$expecting" =~ "Running" ]]; then 
+   expecting=Tests
+  else
+   echo "Hanging test: $prevLine"
+  fi
+ fi
+ if [[ "$line" =~ "Tests run" ]]; then
+  expecting=Running
+ fi
+ if [[ "$line" =~ "Forking command line" ]]; then
+  a=$line
+ else
+  prevLine=$line
+ fi
+done

+ 2 - 2
dev-support/test-patch.sh

@@ -436,8 +436,8 @@ checkJavadocWarnings () {
   echo ""
   echo "There appear to be $javadocWarnings javadoc warnings generated by the patched build."
 
-  #There are 6 warnings that are caused by things that are caused by using sun internal APIs.
-  OK_JAVADOC_WARNINGS=6;
+  #There are 11 warnings that are caused by things that are caused by using sun internal APIs.
+  OK_JAVADOC_WARNINGS=11;
   ### if current warnings greater than OK_JAVADOC_WARNINGS
   if [[ $javadocWarnings -ne $OK_JAVADOC_WARNINGS ]] ; then
     JIRA_COMMENT="$JIRA_COMMENT

+ 77 - 0
hadoop-client/pom.xml

@@ -115,6 +115,14 @@
           <groupId>net.java.dev.jets3t</groupId>
           <artifactId>jets3t</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.jcraft</groupId>
+          <artifactId>jsch</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-el</groupId>
+          <artifactId>commons-el</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -131,6 +139,34 @@
           <groupId>org.apache.avro</groupId>
           <artifactId>avro</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-server</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet</groupId>
+          <artifactId>servlet-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.servlet.jsp</groupId>
+          <artifactId>jsp-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>tomcat</groupId>
+          <artifactId>jasper-runtime</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -171,6 +207,10 @@
           <groupId>jline</groupId>
           <artifactId>jline</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -207,6 +247,18 @@
           <groupId>org.apache.avro</groupId>
           <artifactId>avro</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-json</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -247,6 +299,14 @@
           <groupId>com.google.inject.extensions</groupId>
           <artifactId>guice-servlet</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.sun.jersey</groupId>
+          <artifactId>jersey-json</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -271,8 +331,25 @@
           <groupId>com.google.inject.extensions</groupId>
           <artifactId>guice-servlet</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>io.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-annotations</artifactId>
+      <scope>compile</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>jdk.tools</groupId>
+          <artifactId>jdk.tools</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
+    
   </dependencies>
 
 </project>

+ 20 - 0
hadoop-common-project/hadoop-annotations/pom.xml

@@ -38,4 +38,24 @@
     </dependency>
   </dependencies>
 
+  <profiles>
+    <profile>
+      <id>os.linux</id>
+      <activation>
+        <os>
+          <family>!Mac</family>
+        </os>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>jdk.tools</groupId>
+          <artifactId>jdk.tools</artifactId>
+          <version>1.6</version>
+          <scope>system</scope>
+          <systemPath>${java.home}/../lib/tools.jar</systemPath>
+        </dependency>
+      </dependencies>
+    </profile>
+  </profiles>
+
 </project>

+ 37 - 3
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -340,6 +340,19 @@ Trunk (Unreleased)
     HADOOP-9264. Port change to use Java untar API on Windows from 
     branch-1-win to trunk. (Chris Nauroth via suresh)
 
+    HADOOP-9393. TestRPC fails with JDK7. (Andrew Wang via atm)
+
+    HADOOP-9394. Port findHangingTest.sh from HBase to Hadoop. (Andrew Wang
+    via atm)
+
+    HADOOP-9099. NetUtils.normalizeHostName fails on domains where 
+    UnknownHost resolves to an IP address. (Ivan Mitic via suresh)
+
+    HADOOP-9397. Incremental dist tar build fails (Chris Nauroth via jlowe)
+
+    HADOOP-9405. TestGridmixSummary#testExecutionSummarizer is broken. (Andrew
+    Wang via atm)
+
   OPTIMIZATIONS
 
     HADOOP-7761. Improve the performance of raw comparisons. (todd)
@@ -468,6 +481,11 @@ Trunk (Unreleased)
     HADOOP-9364. PathData#expandAsGlob does not return correct results for
     absolute paths on Windows. (Ivan Mitic via suresh)
 
+    HADOOP-8973. DiskChecker cannot reliably detect an inaccessible disk on
+    Windows with NTFS ACLs. (Chris Nauroth via suresh)
+
+    HADOOP-9388. TestFsShellCopy fails on Windows. (Ivan Mitic via suresh)
+    
 Release 2.0.5-beta - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -493,6 +511,9 @@ Release 2.0.5-beta - UNRELEASED
 
     HADOOP-9343. Allow additional exceptions through the RPC layer. (sseth)
 
+    HADOOP-9318. When exiting on a signal, print the signal name first. (Colin
+    Patrick McCabe via atm)
+
   OPTIMIZATIONS
 
   BUG FIXES
@@ -545,6 +566,11 @@ Release 2.0.5-beta - UNRELEASED
     HADOOP-9379. capture the ulimit info after printing the log to the 
     console. (Arpit Gupta via suresh)
 
+    HADOOP-9399. protoc maven plugin doesn't work on mvn 3.0.2 (todd)
+
+    HADOOP-9407. commons-daemon 1.0.3 dependency has bad group id causing
+    build issues. (Sangjin Lee via suresh)
+
 Release 2.0.4-alpha - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -557,6 +583,14 @@ Release 2.0.4-alpha - UNRELEASED
 
   BUG FIXES
 
+    HADOOP-9406. hadoop-client leaks dependency on JDK tools jar. (tucu)
+
+    HADOOP-9301. hadoop client servlet/jsp/jetty/tomcat JARs creating
+    conflicts in Oozie & HttpFS. (tucu)
+
+    HADOOP-9408. misleading description for net.topology.table.file.name
+    property in core-default.xml. (rajeshbabu via suresh)
+
 Release 2.0.3-alpha - 2013-02-06 
 
   INCOMPATIBLE CHANGES
@@ -2798,6 +2832,9 @@ Release 0.23.0 - 2011-11-01
     HADOOP-7797. Fix top-level pom.xml to refer to correct staging maven
     repository. (omalley via acmurthy) 
 
+    HADOOP-7101. UserGroupInformation.getCurrentUser() fails when called from
+    non-Hadoop JAAS context. (todd)
+
 Release 0.22.1 - Unreleased
 
   INCOMPATIBLE CHANGES
@@ -3255,9 +3292,6 @@ Release 0.22.0 - 2011-11-29
 
     HADOOP-7093. Servlets should default to text/plain (todd)
 
-    HADOOP-7101. UserGroupInformation.getCurrentUser() fails when called from
-    non-Hadoop JAAS context. (todd)
-
     HADOOP-7089. Fix link resolution logic in hadoop-config.sh. (eli)
 
     HADOOP-7046. Fix Findbugs warning in Configuration. (Po Cheung via shv)

+ 4 - 0
hadoop-common-project/hadoop-common/pom.xml

@@ -223,6 +223,10 @@
           <groupId>jline</groupId>
           <artifactId>jline</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.jboss.netty</groupId>
+          <artifactId>netty</artifactId>
+        </exclusion>
         <exclusion>
           <!-- otherwise seems to drag in junit 3.8.1 via jline -->
           <groupId>junit</groupId>

+ 7 - 2
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CommandWithDestination.java

@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.fs.shell;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
@@ -224,7 +223,13 @@ abstract class CommandWithDestination extends FsCommand {
    */ 
   protected void copyFileToTarget(PathData src, PathData target) throws IOException {
     src.fs.setVerifyChecksum(verifyChecksum);
-    copyStreamToTarget(src.fs.open(src.path), target);
+    InputStream in = null;
+    try {
+      in = src.fs.open(src.path);
+      copyStreamToTarget(in, target);
+    } finally {
+      IOUtils.closeStream(in);
+    }
   }
   
   /**

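The change above guarantees the source stream is closed even when copyStreamToTarget() throws. A minimal sketch of the same pattern, assuming a hypothetical standalone reader class and file path; IOUtils.closeStream() ignores null and swallows close-time exceptions, which is why it is safe in the finally block:

    import java.io.InputStream;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;

    public class StreamCloseSketch {
      // Read a few bytes from a (hypothetical) file, closing the stream on every path.
      static byte[] readFirstBytes(Configuration conf, String file) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        InputStream in = null;
        try {
          in = fs.open(new Path(file));              // may throw before 'in' is assigned
          byte[] buf = new byte[16];
          IOUtils.readFully(in, buf, 0, buf.length); // any failure here still reaches finally
          return buf;
        } finally {
          IOUtils.closeStream(in);                   // null-safe, never throws
        }
      }
    }
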
+ 93 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/SignalLogger.java

@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.util;
+
+import sun.misc.Signal;
+import sun.misc.SignalHandler;
+
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * This class logs a message whenever we're about to exit on a UNIX signal.
+ * This is helpful for determining the root cause of a process' exit.
+ * For example, if the process exited because the system administrator 
+ * ran a standard "kill," you would see 'EXITING ON SIGNAL SIGTERM' in the log.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public enum SignalLogger {
+  INSTANCE;
+
+  private boolean registered = false;
+
+  /**
+   * Our signal handler.
+   */
+  private static class Handler implements SignalHandler {
+    final private org.apache.commons.logging.Log LOG;
+    final private SignalHandler prevHandler;
+
+    Handler(String name, Log LOG) {
+      this.LOG = LOG;
+      prevHandler = Signal.handle(new Signal(name), this);
+    }
+
+    /**
+     * Handle an incoming signal.
+     *
+     * @param signal    The incoming signal
+     */
+    @Override
+    public void handle(Signal signal) {
+      LOG.error("RECEIVED SIGNAL " + signal.getNumber() +
+          ": SIG" + signal.getName());
+      prevHandler.handle(signal);
+    }
+  }
+
+  /**
+   * Register some signal handlers.
+   *
+   * @param LOG        The log4j logfile to use in the signal handlers.
+   */
+  public void register(final Log LOG) {
+    if (registered) {
+      throw new IllegalStateException("Can't re-install the signal handlers.");
+    }
+    registered = true;
+    StringBuilder bld = new StringBuilder();
+    bld.append("registered UNIX signal handlers for [");
+    final String SIGNALS[] = { "TERM", "HUP", "INT" };
+    String separator = "";
+    for (String signalName : SIGNALS) {
+      try {
+        new Handler(signalName, LOG);
+        bld.append(separator);
+        bld.append(signalName);
+        separator = ", ";
+      } catch (Exception e) {
+        LOG.debug(e);
+      }
+    }
+    bld.append("]");
+    LOG.info(bld.toString());
+  }
+}

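SignalLogger exposes a single call, register(Log); the Hadoop wiring for it appears in the StringUtils.java hunk below, guarded by SystemUtils.IS_OS_UNIX because sun.misc.Signal is not available on every platform. A minimal sketch of how a daemon entry point would use it (DemoDaemon is hypothetical and not part of this change):

    import org.apache.commons.lang.SystemUtils;
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.util.SignalLogger;

    public class DemoDaemon {
      private static final Log LOG = LogFactory.getLog(DemoDaemon.class);

      public static void main(String[] args) throws Exception {
        if (SystemUtils.IS_OS_UNIX) {
          // Logs "RECEIVED SIGNAL <n>: SIG<name>" and then delegates to the
          // previously installed handler, which normally terminates the JVM.
          SignalLogger.INSTANCE.register(LOG);
        }
        Thread.sleep(Long.MAX_VALUE);  // stand-in for the daemon's real work loop
      }
    }
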
+ 8 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java

@@ -35,6 +35,7 @@ import java.util.StringTokenizer;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.commons.lang.SystemUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.Path;
@@ -613,6 +614,13 @@ public class StringUtils {
         )
       );
 
+    if (SystemUtils.IS_OS_UNIX) {
+      try {
+        SignalLogger.INSTANCE.register(LOG);
+      } catch (Throwable t) {
+        LOG.warn("failed to register any UNIX signal loggers: ", t);
+      }
+    }
     ShutdownHookManager.get().addShutdownHook(
       new Runnable() {
         @Override

+ 1 - 1
hadoop-common-project/hadoop-common/src/main/resources/core-default.xml

@@ -699,7 +699,7 @@
   <name>net.topology.table.file.name</name>
   <value></value>
   <description> The file name for a topology file, which is used when the
-    net.topology.script.file.name property is set to
+    net.topology.node.switch.mapping.impl property is set to
     org.apache.hadoop.net.TableMapping. The file format is a two column text
     file, with columns separated by whitespace. The first column is a DNS or
     IP address and the second column specifies the rack where the address maps.

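For reference, TableMapping reads the file named by net.topology.table.file.name only when net.topology.node.switch.mapping.impl is set to org.apache.hadoop.net.TableMapping, which is what the corrected description now says. An illustrative excerpt of such a two-column file, with hypothetical hosts and racks:

    192.168.1.10              /rack1
    192.168.1.11              /rack1
    datanode12.example.com    /rack2
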
+ 68 - 61
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java

@@ -834,23 +834,27 @@ public class TestRPC {
       TestProtocol.class, new TestImpl(), ADDRESS, 0, 5, true, conf, null
     );
     server.start();
-    InetSocketAddress addr = NetUtils.getConnectAddress(server);
-
-    final TestProtocol proxy = (TestProtocol) RPC.getProxy(
-        TestProtocol.class, TestProtocol.versionID, addr, conf);
-    // Connect to the server
-    proxy.ping();
-    // Interrupt self, try another call
-    Thread.currentThread().interrupt();
     try {
+      InetSocketAddress addr = NetUtils.getConnectAddress(server);
+  
+      final TestProtocol proxy = (TestProtocol) RPC.getProxy(
+          TestProtocol.class, TestProtocol.versionID, addr, conf);
+      // Connect to the server
       proxy.ping();
-      fail("Interruption did not cause IPC to fail");
-    } catch (IOException ioe) {
-      if (!ioe.toString().contains("InterruptedException")) {
-        throw ioe;
+      // Interrupt self, try another call
+      Thread.currentThread().interrupt();
+      try {
+        proxy.ping();
+        fail("Interruption did not cause IPC to fail");
+      } catch (IOException ioe) {
+        if (!ioe.toString().contains("InterruptedException")) {
+          throw ioe;
+        }
+        // clear interrupt status for future tests
+        Thread.interrupted();
       }
-      // clear interrupt status for future tests
-      Thread.interrupted();
+    } finally {
+      server.stop();
     }
   }
   
@@ -862,59 +866,62 @@ public class TestRPC {
     );
 
     server.start();
-
-    int numConcurrentRPC = 200;
-    InetSocketAddress addr = NetUtils.getConnectAddress(server);
-    final CyclicBarrier barrier = new CyclicBarrier(numConcurrentRPC);
-    final CountDownLatch latch = new CountDownLatch(numConcurrentRPC);
-    final AtomicBoolean leaderRunning = new AtomicBoolean(true);
-    final AtomicReference<Throwable> error = new AtomicReference<Throwable>();
-    Thread leaderThread = null;
-    
-    for (int i = 0; i < numConcurrentRPC; i++) {
-      final int num = i;
-      final TestProtocol proxy = (TestProtocol) RPC.getProxy(
-      TestProtocol.class, TestProtocol.versionID, addr, conf);
-      Thread rpcThread = new Thread(new Runnable() {
-        @Override
-        public void run() {
-          try {
-            barrier.await();
-            while (num == 0 || leaderRunning.get()) {
+    try {
+      int numConcurrentRPC = 200;
+      InetSocketAddress addr = NetUtils.getConnectAddress(server);
+      final CyclicBarrier barrier = new CyclicBarrier(numConcurrentRPC);
+      final CountDownLatch latch = new CountDownLatch(numConcurrentRPC);
+      final AtomicBoolean leaderRunning = new AtomicBoolean(true);
+      final AtomicReference<Throwable> error = new AtomicReference<Throwable>();
+      Thread leaderThread = null;
+      
+      for (int i = 0; i < numConcurrentRPC; i++) {
+        final int num = i;
+        final TestProtocol proxy = (TestProtocol) RPC.getProxy(
+        TestProtocol.class, TestProtocol.versionID, addr, conf);
+        Thread rpcThread = new Thread(new Runnable() {
+          @Override
+          public void run() {
+            try {
+              barrier.await();
+              while (num == 0 || leaderRunning.get()) {
+                proxy.slowPing(false);
+              }
+  
               proxy.slowPing(false);
+            } catch (Exception e) {
+              if (num == 0) {
+                leaderRunning.set(false);
+              } else {
+                error.set(e);
+              }
+  
+              LOG.error(e);
+            } finally {
+              latch.countDown();
             }
-
-            proxy.slowPing(false);
-          } catch (Exception e) {
-            if (num == 0) {
-              leaderRunning.set(false);
-            } else {
-              error.set(e);
-            }
-
-            LOG.error(e);
-          } finally {
-            latch.countDown();
           }
+        });
+        rpcThread.start();
+  
+        if (leaderThread == null) {
+         leaderThread = rpcThread;
         }
-      });
-      rpcThread.start();
-
-      if (leaderThread == null) {
-       leaderThread = rpcThread;
       }
+      // let threads get past the barrier
+      Thread.sleep(1000);
+      // stop a single thread
+      while (leaderRunning.get()) {
+        leaderThread.interrupt();
+      }
+      
+      latch.await();
+      
+      // should not cause any other thread to get an error
+      assertTrue("rpc got exception " + error.get(), error.get() == null);
+    } finally {
+      server.stop();
     }
-    // let threads get past the barrier
-    Thread.sleep(1000);
-    // stop a single thread
-    while (leaderRunning.get()) {
-      leaderThread.interrupt();
-    }
-    
-    latch.await();
-    
-    // should not cause any other thread to get an error
-    assertTrue("rpc got exception " + error.get(), error.get() == null);
   }
 
   public static void main(String[] args) throws Exception {

+ 1 - 1
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetUtils.java

@@ -605,7 +605,7 @@ public class TestNetUtils {
   @Test
   public void testNormalizeHostName() {	
     List<String> hosts = Arrays.asList(new String[] {"127.0.0.1",
-        "localhost", "3w.org", "UnknownHost"});
+        "localhost", "3w.org", "UnknownHost123"});
     List<String> normalizedHosts = NetUtils.normalizeHostNames(hosts);
     // when ipaddress is normalized, same address is expected in return
     assertEquals(normalizedHosts.get(0), hosts.get(0));

+ 13 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/GenericTestUtils.java

@@ -162,6 +162,9 @@ public abstract class GenericTestUtils {
     private final CountDownLatch waitLatch = new CountDownLatch(1);
     private final CountDownLatch resultLatch = new CountDownLatch(1);
     
+    private final AtomicInteger fireCounter = new AtomicInteger(0);
+    private final AtomicInteger resultCounter = new AtomicInteger(0);
+    
     // Result fields set after proceed() is called.
     private volatile Throwable thrown;
     private volatile Object returnValue;
@@ -188,6 +191,7 @@ public abstract class GenericTestUtils {
     @Override
     public Object answer(InvocationOnMock invocation) throws Throwable {
       LOG.info("DelayAnswer firing fireLatch");
+      fireCounter.getAndIncrement();
       fireLatch.countDown();
       try {
         LOG.info("DelayAnswer waiting on waitLatch");
@@ -208,6 +212,7 @@ public abstract class GenericTestUtils {
         thrown = t;
         throw t;
       } finally {
+        resultCounter.incrementAndGet();
         resultLatch.countDown();
       }
     }
@@ -235,6 +240,14 @@ public abstract class GenericTestUtils {
     public Object getReturnValue() {
       return returnValue;
     }
+    
+    public int getFireCount() {
+      return fireCounter.get();
+    }
+    
+    public int getResultCount() {
+      return resultCounter.get();
+    }
   }
   
   /**

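The new counters make it possible to assert how many times a stubbed call has fired versus completed. A hedged sketch of typical usage with Mockito, assuming DelayAnswer's existing waitForCall() and proceed() helpers and a hypothetical FaultyService collaborator:

    import static org.mockito.Mockito.doAnswer;
    import static org.mockito.Mockito.mock;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;

    public class DelayAnswerSketch {
      private static final Log LOG = LogFactory.getLog(DelayAnswerSketch.class);

      interface FaultyService {            // hypothetical collaborator
        String doWork() throws Exception;
      }

      public static void main(String[] args) throws Exception {
        DelayAnswer delayer = new DelayAnswer(LOG);
        final FaultyService svc = mock(FaultyService.class);
        doAnswer(delayer).when(svc).doWork();

        Thread caller = new Thread(new Runnable() {
          @Override
          public void run() {
            try { svc.doWork(); } catch (Exception ignored) { }
          }
        });
        caller.start();

        delayer.waitForCall();                  // block until doWork() is invoked
        assert delayer.getFireCount() == 1;     // the call reached the answer...
        assert delayer.getResultCount() == 0;   // ...but has not completed yet
        delayer.proceed();                      // release the blocked call
        caller.join();
        assert delayer.getResultCount() == 1;   // completed after proceed()
      }
    }
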
+ 42 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestSignalLogger.java

@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.util;
+
+import org.apache.commons.lang.SystemUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Test;
+
+public class TestSignalLogger {
+  public static final Log LOG = LogFactory.getLog(TestSignalLogger.class);
+  
+  @Test(timeout=60000)
+  public void testInstall() throws Exception {
+    Assume.assumeTrue(SystemUtils.IS_OS_UNIX);
+    SignalLogger.INSTANCE.register(LOG);
+    try {
+      SignalLogger.INSTANCE.register(LOG);
+      Assert.fail("expected IllegalStateException from double registration");
+    } catch (IllegalStateException e) {
+      // fall through
+    }
+  }
+}

+ 1 - 1
hadoop-dist/pom.xml

@@ -152,7 +152,7 @@
                       }
 
                       run tar cf hadoop-${project.version}.tar hadoop-${project.version}
-                      run gzip hadoop-${project.version}.tar
+                      run gzip -f hadoop-${project.version}.tar
                       echo
                       echo "Hadoop dist tar available at: ${project.build.directory}/hadoop-${project.version}.tar.gz"
                       echo

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/lib/server/Server.java

@@ -232,7 +232,7 @@ public class Server {
    * path.
    */
   private String checkAbsolutePath(String value, String name) {
-    if (!value.startsWith("/")) {
+    if (!new File(value).isAbsolute()) {
       throw new IllegalArgumentException(
         MessageFormat.format("[{0}] must be an absolute path [{1}]", name, value));
     }

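The switch from startsWith("/") to File.isAbsolute() matters because absolute Windows paths such as C:\dir do not begin with a forward slash. A small illustration with hypothetical paths:

    import java.io.File;

    public class AbsolutePathCheckSketch {
      public static void main(String[] args) {
        String unixStyle = "/var/lib/httpfs";          // hypothetical paths
        String windowsStyle = "C:\\hadoop\\httpfs";

        // Old check: only forward-slash paths pass, so windowsStyle is rejected.
        System.out.println(unixStyle.startsWith("/"));      // true
        System.out.println(windowsStyle.startsWith("/"));   // false

        // New check: java.io.File applies the running platform's notion of "absolute".
        System.out.println(new File(unixStyle).isAbsolute());    // true on Unix
        System.out.println(new File(windowsStyle).isAbsolute()); // true on Windows
      }
    }
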
+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/client/BaseTestHttpFSWith.java

@@ -364,7 +364,7 @@ public abstract class BaseTestHttpFSWith extends HFSTestCase {
     }
   }
 
-  private void testSetPermission() throws Exception {
+  protected void testSetPermission() throws Exception {
     FileSystem fs = FileSystem.get(getProxiedFSConf());
     Path path = new Path(getProxiedFSTestDir(), "foodir");
     fs.mkdirs(path);

+ 30 - 15
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/client/TestHttpFSFileSystemLocalFileSystem.java

@@ -20,8 +20,13 @@ package org.apache.hadoop.fs.http.client;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.test.TestDirHelper;
+import org.junit.Assert;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
@@ -65,21 +70,31 @@ public class TestHttpFSFileSystemLocalFileSystem extends BaseTestHttpFSWith {
   }
 
   protected Path addPrefix(Path path) {
-    URI uri = path.toUri();
-    try {
-      if (uri.getAuthority() != null) {
-        uri = new URI(uri.getScheme(),
-                      uri.getAuthority(), PATH_PREFIX + uri.getPath());
-      }
-      else {
-        if (uri.getPath().startsWith("/")) {
-          uri = new URI(PATH_PREFIX + uri.getPath());
-        }
-      }
-    } catch (URISyntaxException ex) {
-      throw new RuntimeException("It should not happen: " + ex.toString(), ex);
-    }
-    return new Path(uri);
+    return Path.mergePaths(new Path(PATH_PREFIX), path);
   }
 
+  @Override
+  protected void testSetPermission() throws Exception {
+    if (Path.WINDOWS) {
+      FileSystem fs = FileSystem.get(getProxiedFSConf());
+      Path path = new Path(getProxiedFSTestDir(), "foodir");
+      fs.mkdirs(path);
+
+      fs = getHttpFSFileSystem();
+      FsPermission permission1 = new FsPermission(FsAction.READ_WRITE, FsAction.NONE, FsAction.NONE);
+      fs.setPermission(path, permission1);
+      fs.close();
+
+      fs = FileSystem.get(getProxiedFSConf());
+      FileStatus status1 = fs.getFileStatus(path);
+      fs.close();
+      FsPermission permission2 = status1.getPermission();
+      Assert.assertEquals(permission2, permission1);
+
+      // sticky bit not supported on Windows with local file system, so the
+      // subclass skips that part of the test
+    } else {
+      super.testSetPermission();
+    }
+  }
 }

+ 24 - 10
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/lib/server/TestServer.java

@@ -42,6 +42,7 @@ import org.apache.hadoop.test.HTestCase;
 import org.apache.hadoop.test.TestDir;
 import org.apache.hadoop.test.TestDirHelper;
 import org.apache.hadoop.test.TestException;
+import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.util.StringUtils;
 import org.junit.Test;
 
@@ -50,21 +51,24 @@ public class TestServer extends HTestCase {
   @Test
   @TestDir
   public void constructorsGetters() throws Exception {
-    Server server = new Server("server", "/a", "/b", "/c", "/d", new Configuration(false));
-    assertEquals(server.getHomeDir(), "/a");
-    assertEquals(server.getConfigDir(), "/b");
-    assertEquals(server.getLogDir(), "/c");
-    assertEquals(server.getTempDir(), "/d");
+    Server server = new Server("server", getAbsolutePath("/a"),
+      getAbsolutePath("/b"), getAbsolutePath("/c"), getAbsolutePath("/d"),
+      new Configuration(false));
+    assertEquals(server.getHomeDir(), getAbsolutePath("/a"));
+    assertEquals(server.getConfigDir(), getAbsolutePath("/b"));
+    assertEquals(server.getLogDir(), getAbsolutePath("/c"));
+    assertEquals(server.getTempDir(), getAbsolutePath("/d"));
     assertEquals(server.getName(), "server");
     assertEquals(server.getPrefix(), "server");
     assertEquals(server.getPrefixedName("name"), "server.name");
     assertNotNull(server.getConfig());
 
-    server = new Server("server", "/a", "/b", "/c", "/d");
-    assertEquals(server.getHomeDir(), "/a");
-    assertEquals(server.getConfigDir(), "/b");
-    assertEquals(server.getLogDir(), "/c");
-    assertEquals(server.getTempDir(), "/d");
+    server = new Server("server", getAbsolutePath("/a"), getAbsolutePath("/b"),
+      getAbsolutePath("/c"), getAbsolutePath("/d"));
+    assertEquals(server.getHomeDir(), getAbsolutePath("/a"));
+    assertEquals(server.getConfigDir(), getAbsolutePath("/b"));
+    assertEquals(server.getLogDir(), getAbsolutePath("/c"));
+    assertEquals(server.getTempDir(), getAbsolutePath("/d"));
     assertEquals(server.getName(), "server");
     assertEquals(server.getPrefix(), "server");
     assertEquals(server.getPrefixedName("name"), "server.name");
@@ -793,4 +797,14 @@ public class TestServer extends HTestCase {
     server.destroy();
   }
 
+  /**
+   * Creates an absolute path by appending the given relative path to the test
+   * root.
+   * 
+   * @param relativePath String relative path
+   * @return String absolute path formed by appending relative path to test root
+   */
+  private static String getAbsolutePath(String relativePath) {
+    return new File(TestDirHelper.getTestDir(), relativePath).getAbsolutePath();
+  }
 }

+ 4 - 1
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/lib/servlet/TestHostnameFilter.java

@@ -50,7 +50,10 @@ public class TestHostnameFilter extends HTestCase {
       @Override
       public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse)
         throws IOException, ServletException {
-        assertTrue(HostnameFilter.get().contains("localhost"));
+        // Hostname was set to "localhost", but may get resolved automatically to
+        // "127.0.0.1" depending on OS.
+        assertTrue(HostnameFilter.get().contains("localhost") ||
+          HostnameFilter.get().contains("127.0.0.1"));
         invoked.set(true);
       }
     };

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/test/TestDirHelper.java

@@ -63,7 +63,7 @@ public class TestDirHelper implements MethodRule {
   static {
     try {
       TEST_DIR_ROOT = System.getProperty(TEST_DIR_PROP, new File("target").getAbsolutePath());
-      if (!TEST_DIR_ROOT.startsWith("/")) {
+      if (!new File(TEST_DIR_ROOT).isAbsolute()) {
         System.err.println(MessageFormat.format("System property [{0}]=[{1}] must be set to an absolute path",
                                                 TEST_DIR_PROP, TEST_DIR_ROOT));
         System.exit(-1);

+ 2 - 1
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/test/TestHdfsHelper.java

@@ -82,7 +82,8 @@ public class TestHdfsHelper extends TestDirHelper {
 
     private Path resetHdfsTestDir(Configuration conf) {
 
-      Path testDir = new Path("./" + TEST_DIR_ROOT, testName + "-" + counter.getAndIncrement());
+      Path testDir = new Path("/tmp/" + testName + "-" +
+        counter.getAndIncrement());
       try {
         // currentUser
         FileSystem fs = FileSystem.get(conf);

+ 44 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -309,6 +309,9 @@ Trunk (Unreleased)
     HDFS-4391. TestDataTransferKeepalive fails when tests are executed in a
     certain order. (Andrew Wang via atm)
 
+    HDFS-4586. TestDataDirs.testGetDataDirsFromURIs fails with all directories
+    in dfs.datanode.data.dir are invalid. (Ivan Mitic via atm)
+
   BREAKDOWN OF HADOOP-8562 SUBTASKS AND RELATED JIRAS
 
     HDFS-4145. Merge hdfs cmd line scripts from branch-1-win. (David Lao,
@@ -327,6 +330,16 @@ Trunk (Unreleased)
 
     HDFS-4572. Fix TestJournal failures on Windows. (Arpit Agarwal via suresh)
 
+    HDFS-4287. HTTPFS tests fail on Windows. (Chris Nauroth via suresh)
+
+    HDFS-4593. TestSaveNamespace fails on Windows. (Arpit Agarwal via suresh)
+
+    HDFS-4582. TestHostsFiles fails on Windows. (Ivan Mitic via suresh)
+
+    HDFS-4603. TestMiniDFSCluster fails on Windows. (Ivan Mitic via suresh)
+
+    HDFS-4604. TestJournalNode fails on Windows. (Ivan Mitic via suresh)
+
 Release 2.0.5-beta - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -386,6 +399,31 @@ Release 2.0.5-beta - UNRELEASED
     HDFS-4571. WebHDFS should not set the service hostname on the server side. 
     (tucu)
 
+    HDFS-4013. TestHftpURLTimeouts throws NPE. (Chao Shi via suresh)
+
+    HDFS-4592. Default values for access time precision are out of sync between
+    hdfs-default.xml and the code. (atm)
+
+    HDFS-4522. LightWeightGSet expects incrementing a volatile to be atomic.
+    (Colin Patrick McCabe via atm)
+
+    HDFS-4484. libwebhdfs compilation broken with gcc 4.6.2. (Colin Patrick
+    McCabe via atm)
+
+    HDFS-4595. When short circuit read is fails, DFSClient does not fallback
+    to regular reads. (suresh)
+
+    HDFS-4583. TestNodeCount fails. (Ivan Mitic via suresh)
+
+    HDFS-4591. HA clients can fail to fail over while Standby NN is performing
+    long checkpoint. (atm)
+
+    HDFS-3277. fail over to loading a different FSImage if the first one we
+    try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
+
+    HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum
+    error. (Andrew Wang via atm)
+
 Release 2.0.4-alpha - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -2394,6 +2432,12 @@ Release 0.23.7 - UNRELEASED
     HDFS-4577. Webhdfs operations should declare if authentication is required
     (daryn via kihwal)
 
+    HDFS-3344. Unreliable corrupt blocks counting in TestProcessCorruptBlocks
+    (kihwal)
+
+    HDFS-3367. WebHDFS doesn't use the logged in user when opening
+    connections (daryn)
+
 Release 0.23.6 - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/src/contrib/libwebhdfs/CMakeLists.txt

@@ -48,6 +48,7 @@ add_executable(test_libwebhdfs_ops
 )
 target_link_libraries(test_libwebhdfs_ops
     webhdfs
+    native_mini_dfs
 )
 
 add_executable(test_libwebhdfs_read
@@ -69,4 +70,6 @@ add_executable(test_libwebhdfs_threaded
 )
 target_link_libraries(test_libwebhdfs_threaded
     webhdfs
+    native_mini_dfs
+    pthread
 )

+ 6 - 5
hadoop-hdfs-project/hadoop-hdfs/src/contrib/libwebhdfs/src/test_libwebhdfs_ops.c

@@ -257,7 +257,8 @@ int main(int argc, char **argv)
 
         const char* path[] = {"/foo", "/foo/bar", "foobar", "//foo/bar//foobar",
                               "foo//bar", "foo/bar///", "/", "////"};
-        for (int i = 0; i < 8; i++) {
+        int i;
+        for (i = 0; i < 8; i++) {
             fprintf(stderr, "hdfsSetWorkingDirectory: %s, %s\n",
                     ((result = hdfsSetWorkingDirectory(fs, path[i])) ?
                      "Failed!" : "Success!"),
@@ -281,8 +282,8 @@ int main(int argc, char **argv)
             fprintf(stderr, "Name: %s, ", fileInfo->mName);
             fprintf(stderr, "Type: %c, ", (char)(fileInfo->mKind));
             fprintf(stderr, "Replication: %d, ", fileInfo->mReplication);
-            fprintf(stderr, "BlockSize: %lld, ", fileInfo->mBlockSize);
-            fprintf(stderr, "Size: %lld, ", fileInfo->mSize);
+            fprintf(stderr, "BlockSize: %"PRId64", ", fileInfo->mBlockSize);
+            fprintf(stderr, "Size: %"PRId64", ", fileInfo->mSize);
             fprintf(stderr, "LastMod: %s", ctime(&fileInfo->mLastMod));
             fprintf(stderr, "Owner: %s, ", fileInfo->mOwner);
             fprintf(stderr, "Group: %s, ", fileInfo->mGroup);
@@ -305,8 +306,8 @@ int main(int argc, char **argv)
                 fprintf(stderr, "Name: %s, ", fileList[i].mName);
                 fprintf(stderr, "Type: %c, ", (char)fileList[i].mKind);
                 fprintf(stderr, "Replication: %d, ", fileList[i].mReplication);
-                fprintf(stderr, "BlockSize: %lld, ", fileList[i].mBlockSize);
-                fprintf(stderr, "Size: %lld, ", fileList[i].mSize);
+                fprintf(stderr, "BlockSize: %"PRId64", ", fileList[i].mBlockSize);
+                fprintf(stderr, "Size: %"PRId64", ", fileList[i].mSize);
                 fprintf(stderr, "LastMod: %s", ctime(&fileList[i].mLastMod));
                 fprintf(stderr, "Owner: %s, ", fileList[i].mOwner);
                 fprintf(stderr, "Group: %s, ", fileList[i].mGroup);

+ 27 - 13
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderLocal.java

@@ -23,6 +23,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.net.Socket;
 import java.nio.ByteBuffer;
+import java.security.PrivilegedExceptionAction;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
@@ -31,6 +32,7 @@ import java.util.Map;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
 import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
@@ -41,6 +43,7 @@ import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
 import org.apache.hadoop.hdfs.util.DirectBufferPool;
 import org.apache.hadoop.ipc.RPC;
 import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.DataChecksum;
 
@@ -86,11 +89,21 @@ class BlockReaderLocal implements BlockReader {
     }
 
     private synchronized ClientDatanodeProtocol getDatanodeProxy(
-        DatanodeInfo node, Configuration conf, int socketTimeout,
-        boolean connectToDnViaHostname) throws IOException {
+        UserGroupInformation ugi, final DatanodeInfo node,
+        final Configuration conf, final int socketTimeout,
+        final boolean connectToDnViaHostname) throws IOException {
       if (proxy == null) {
-        proxy = DFSUtil.createClientDatanodeProtocolProxy(node, conf,
-            socketTimeout, connectToDnViaHostname);
+        try {
+          proxy = ugi.doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
+            @Override
+            public ClientDatanodeProtocol run() throws Exception {
+              return DFSUtil.createClientDatanodeProtocolProxy(node, conf,
+                  socketTimeout, connectToDnViaHostname);
+            }
+          });
+        } catch (InterruptedException e) {
+          LOG.warn("encountered exception ", e);
+        }
       }
       return proxy;
     }
@@ -154,17 +167,18 @@ class BlockReaderLocal implements BlockReader {
   /**
    * The only way this object can be instantiated.
    */
-  static BlockReaderLocal newBlockReader(Configuration conf, String file,
-      ExtendedBlock blk, Token<BlockTokenIdentifier> token, DatanodeInfo node,
-      int socketTimeout, long startOffset, long length,
-      boolean connectToDnViaHostname) throws IOException {
+  static BlockReaderLocal newBlockReader(UserGroupInformation ugi,
+      Configuration conf, String file, ExtendedBlock blk,
+      Token<BlockTokenIdentifier> token, DatanodeInfo node, int socketTimeout,
+      long startOffset, long length, boolean connectToDnViaHostname)
+      throws IOException {
 
     LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node
         .getIpcPort());
     // check the cache first
     BlockLocalPathInfo pathinfo = localDatanodeInfo.getBlockLocalPathInfo(blk);
     if (pathinfo == null) {
-      pathinfo = getBlockPathInfo(blk, node, conf, socketTimeout, token,
+      pathinfo = getBlockPathInfo(ugi, blk, node, conf, socketTimeout, token,
           connectToDnViaHostname);
     }
 
@@ -241,13 +255,13 @@ class BlockReaderLocal implements BlockReader {
     return ldInfo;
   }
   
-  private static BlockLocalPathInfo getBlockPathInfo(ExtendedBlock blk,
-      DatanodeInfo node, Configuration conf, int timeout,
+  private static BlockLocalPathInfo getBlockPathInfo(UserGroupInformation ugi,
+      ExtendedBlock blk, DatanodeInfo node, Configuration conf, int timeout,
       Token<BlockTokenIdentifier> token, boolean connectToDnViaHostname)
-          throws IOException {
+      throws IOException {
     LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node.getIpcPort());
     BlockLocalPathInfo pathinfo = null;
-    ClientDatanodeProtocol proxy = localDatanodeInfo.getDatanodeProxy(node,
+    ClientDatanodeProtocol proxy = localDatanodeInfo.getDatanodeProxy(ugi, node,
         conf, timeout, connectToDnViaHostname);
     try {
       // make RPC to local datanode to find local pathnames of blocks

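Wrapping the proxy creation in ugi.doAs() ties the resulting RPC connection to the supplied UserGroupInformation instead of whatever user happens to be current when the proxy is first built. A stripped-down sketch of the pattern; connectToDatanode() is a hypothetical stand-in for DFSUtil.createClientDatanodeProtocolProxy():

    import java.security.PrivilegedExceptionAction;
    import org.apache.hadoop.security.UserGroupInformation;

    public class DoAsSketch {
      public static void main(String[] args) throws Exception {
        UserGroupInformation ugi = UserGroupInformation.getCurrentUser();

        // Everything inside run() executes as 'ugi', so connections created
        // there authenticate as that user rather than an ambient login.
        String result = ugi.doAs(new PrivilegedExceptionAction<String>() {
          @Override
          public String run() throws Exception {
            return connectToDatanode();   // hypothetical stand-in for the proxy creation
          }
        });
        System.out.println(result);
      }

      private static String connectToDatanode() {
        return "connected";
      }
    }
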
+ 14 - 6
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java

@@ -415,6 +415,7 @@ public class DFSClient implements java.io.Closeable {
           "null URI");
       NameNodeProxies.ProxyAndInfo<ClientProtocol> proxyInfo =
         NameNodeProxies.createProxy(conf, nameNodeUri, ClientProtocol.class);
+      
       this.dtService = proxyInfo.getDelegationTokenService();
       this.namenode = proxyInfo.getProxy();
     }
@@ -794,12 +795,13 @@ public class DFSClient implements java.io.Closeable {
   /**
    * Get {@link BlockReader} for short circuited local reads.
    */
-  static BlockReader getLocalBlockReader(Configuration conf,
-      String src, ExtendedBlock blk, Token<BlockTokenIdentifier> accessToken,
-      DatanodeInfo chosenNode, int socketTimeout, long offsetIntoBlock,
-      boolean connectToDnViaHostname) throws InvalidToken, IOException {
+  static BlockReader getLocalBlockReader(UserGroupInformation ugi,
+      Configuration conf, String src, ExtendedBlock blk,
+      Token<BlockTokenIdentifier> accessToken, DatanodeInfo chosenNode,
+      int socketTimeout, long offsetIntoBlock, boolean connectToDnViaHostname)
+      throws InvalidToken, IOException {
     try {
-      return BlockReaderLocal.newBlockReader(conf, src, blk, accessToken,
+      return BlockReaderLocal.newBlockReader(ugi, conf, src, blk, accessToken,
           chosenNode, socketTimeout, offsetIntoBlock, blk.getNumBytes()
               - offsetIntoBlock, connectToDnViaHostname);
     } catch (RemoteException re) {
@@ -1621,7 +1623,7 @@ public class DFSClient implements java.io.Closeable {
    * @param socketFactory to create sockets to connect to DNs
    * @param socketTimeout timeout to use when connecting and waiting for a response
    * @param encryptionKey the key needed to communicate with DNs in this cluster
-   * @param connectToDnViaHostname {@see #connectToDnViaHostname()}
+   * @param connectToDnViaHostname {@link #connectToDnViaHostname()}
    * @return The checksum 
    */
   static MD5MD5CRC32FileChecksum getFileChecksum(String src,
@@ -2323,6 +2325,12 @@ public class DFSClient implements java.io.Closeable {
   }
 
   void disableShortCircuit() {
+    LOG.info("Short circuit is disabled");
     shortCircuitLocalReads = false;
   }
+  
+  @VisibleForTesting
+  boolean getShortCircuitLocalReads() {
+    return shortCircuitLocalReads;
+  }
 }

+ 10 - 6
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java

@@ -460,6 +460,10 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
                              " for " + blk);
         }
         return chosenNode;
+      } catch (AccessControlException ex) {
+        DFSClient.LOG.warn("Short circuit access failed " + ex);
+        dfsClient.disableShortCircuit();
+        continue;
       } catch (IOException ex) {
         if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
           DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
@@ -806,7 +810,7 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
         // we want to remember what we have tried
         addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
       } catch (AccessControlException ex) {
-        DFSClient.LOG.warn("Short circuit access failed ", ex);
+        DFSClient.LOG.warn("Short circuit access failed " + ex);
         dfsClient.disableShortCircuit();
         continue;
       } catch (IOException e) {
@@ -885,9 +889,9 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
     // Can't local read a block under construction, see HDFS-2757
     if (dfsClient.shouldTryShortCircuitRead(dnAddr) &&
         !blockUnderConstruction()) {
-      return DFSClient.getLocalBlockReader(dfsClient.conf, src, block,
-          blockToken, chosenNode, dfsClient.hdfsTimeout, startOffset,
-          dfsClient.connectToDnViaHostname());
+      return DFSClient.getLocalBlockReader(dfsClient.ugi, dfsClient.conf,
+          src, block, blockToken, chosenNode, dfsClient.hdfsTimeout,
+          startOffset, dfsClient.connectToDnViaHostname());
     }
     
     IOException err = null;
@@ -1027,8 +1031,8 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
    * only report if the total number of replica is 1. We do not
    * report otherwise since this maybe due to the client is a handicapped client
    * (who can not read).
-   * @param corruptedBlockMap, map of corrupted blocks
-   * @param dataNodeCount, number of data nodes who contains the block replicas
+   * @param corruptedBlockMap map of corrupted blocks
+   * @param dataNodeCount number of data nodes who contains the block replicas
    */
   private void reportCheckSumFailure(
       Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 
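
The AccessControlException handlers added in the DFSInputStream hunks above make short-circuit reads fail soft: the client logs the failure, turns short-circuit off for the rest of the session, and retries the same read over the normal remote path. A minimal self-contained sketch of that fallback flow, with illustrative stand-in types rather than the real HDFS classes:

import java.io.IOException;

public class ShortCircuitFallbackSketch {
  // Stand-in for org.apache.hadoop.security.AccessControlException.
  static class AccessControlException extends IOException {
    AccessControlException(String msg) { super(msg); }
  }

  interface BlockReader { int read(byte[] buf) throws IOException; }

  static class Client {
    private volatile boolean shortCircuitLocalReads = true;

    void disableShortCircuit() {
      System.out.println("Short circuit is disabled");
      shortCircuitLocalReads = false;
    }

    BlockReader getBlockReader(boolean blockUnderConstruction) throws IOException {
      // Mirrors the hunks above: only try a local read for finalized blocks,
      // and on an access failure fall back permanently to remote reads.
      if (shortCircuitLocalReads && !blockUnderConstruction) {
        try {
          return getLocalBlockReader();
        } catch (AccessControlException ex) {
          System.out.println("Short circuit access failed " + ex);
          disableShortCircuit();
        }
      }
      return getRemoteBlockReader();
    }

    private BlockReader getLocalBlockReader() throws IOException {
      // Simulate a reader that is not allowed to open the block file directly.
      throw new AccessControlException("reader not permitted to access local block files");
    }

    private BlockReader getRemoteBlockReader() {
      return new BlockReader() {
        @Override
        public int read(byte[] buf) { return -1; }  // stub remote reader
      };
    }
  }

  public static void main(String[] args) throws IOException {
    Client client = new Client();
    client.getBlockReader(false);  // first read trips the fallback
    System.out.println("short circuit still enabled? " + client.shortCircuitLocalReads);
  }
}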

+ 3 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java

@@ -70,6 +70,8 @@ import org.apache.hadoop.security.token.SecretManager.InvalidToken;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.Progressable;
 
+import com.google.common.annotations.VisibleForTesting;
+
 
 /****************************************************************
  * Implementation of the abstract FileSystem for the DFS system.
@@ -567,9 +569,8 @@ public class DistributedFileSystem extends FileSystem {
     return "DFS[" + dfs + "]";
   }
 
-  /** @deprecated DFSClient should not be accessed directly. */
   @InterfaceAudience.Private
-  @Deprecated
+  @VisibleForTesting
   public DFSClient getClient() {
     return dfs;
   }        

+ 2 - 14
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNode.java

@@ -35,6 +35,7 @@ import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.source.JvmMetrics;
 import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.util.DiskChecker;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -82,7 +83,6 @@ public class JournalNode implements Tool, Configurable {
     return journal;
   }
 
-
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -97,21 +97,9 @@ public class JournalNode implements Tool, Configurable {
           "Journal dir '" + dir + "' should be an absolute path");
     }
 
-    if (!dir.exists() && !dir.mkdirs()) {
-      throw new IOException("Could not create journal dir '" +
-          dir + "'");
-    } else if (!dir.isDirectory()) {
-      throw new IOException("Journal directory '" + dir + "' is not " +
-          "a directory");
-    }
-    
-    if (!dir.canWrite()) {
-      throw new IOException("Unable to write to journal dir '" +
-          dir + "'");
-    }
+    DiskChecker.checkDir(dir);
   }
 
-
   @Override
   public Configuration getConf() {
     return conf;

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java

@@ -78,6 +78,7 @@ public class DelegationTokenSecretManager
   
   @Override //SecretManager
   public void checkAvailableForRead() throws StandbyException {
+    namesystem.checkOperation(OperationCategory.READ);
     namesystem.readLock();
     try {
       namesystem.checkOperation(OperationCategory.READ);

+ 3 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -62,6 +62,7 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
 import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.Namesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
 import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
 import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
 import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
@@ -874,9 +875,10 @@ public class BlockManager {
    */
   public BlocksWithLocations getBlocks(DatanodeID datanode, long size
       ) throws IOException {
+    namesystem.checkOperation(OperationCategory.READ);
     namesystem.readLock();
     try {
-      namesystem.checkSuperuserPrivilege();
+      namesystem.checkOperation(OperationCategory.READ);
       return getBlocksWithLocations(datanode, size);  
     } finally {
       namesystem.readUnlock();

+ 22 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java

@@ -1617,6 +1617,21 @@ public class DataNode extends Configured
     }
   }
 
+  // Small wrapper around the DiskChecker class that provides means to mock
+  // DiskChecker static methods and unittest DataNode#getDataDirsFromURIs.
+  static class DataNodeDiskChecker {
+    private FsPermission expectedPermission;
+
+    public DataNodeDiskChecker(FsPermission expectedPermission) {
+      this.expectedPermission = expectedPermission;
+    }
+
+    public void checkDir(LocalFileSystem localFS, Path path)
+        throws DiskErrorException, IOException {
+      DiskChecker.checkDir(localFS, path, expectedPermission);
+    }
+  }
+
   /**
    * Make an instance of DataNode after ensuring that at least one of the
    * given data directories (and their parent directories, if necessary)
@@ -1635,7 +1650,10 @@ public class DataNode extends Configured
     FsPermission permission = new FsPermission(
         conf.get(DFS_DATANODE_DATA_DIR_PERMISSION_KEY,
                  DFS_DATANODE_DATA_DIR_PERMISSION_DEFAULT));
-    ArrayList<File> dirs = getDataDirsFromURIs(dataDirs, localFS, permission);
+    DataNodeDiskChecker dataNodeDiskChecker =
+        new DataNodeDiskChecker(permission);
+    ArrayList<File> dirs =
+        getDataDirsFromURIs(dataDirs, localFS, dataNodeDiskChecker);
     DefaultMetricsSystem.initialize("DataNode");
 
     assert dirs.size() > 0 : "number of data directories should be > 0";
@@ -1644,7 +1662,8 @@ public class DataNode extends Configured
 
   // DataNode ctor expects AbstractList instead of List or Collection...
   static ArrayList<File> getDataDirsFromURIs(Collection<URI> dataDirs,
-      LocalFileSystem localFS, FsPermission permission) throws IOException {
+      LocalFileSystem localFS, DataNodeDiskChecker dataNodeDiskChecker)
+          throws IOException {
     ArrayList<File> dirs = new ArrayList<File>();
     StringBuilder invalidDirs = new StringBuilder();
     for (URI dirURI : dataDirs) {
@@ -1656,7 +1675,7 @@ public class DataNode extends Configured
       // drop any (illegal) authority in the URI for backwards compatibility
       File dir = new File(dirURI.getPath());
       try {
-        DiskChecker.checkDir(localFS, new Path(dir.toURI()), permission);
+        dataNodeDiskChecker.checkDir(localFS, new Path(dir.toURI()));
         dirs.add(dir);
       } catch (IOException ioe) {
         LOG.warn("Invalid " + DFS_DATANODE_DATA_DIR_KEY + " "

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointFaultInjector.java

@@ -44,4 +44,5 @@ class CheckpointFaultInjector {
     return false;
   }
   
+  public void afterMD5Rename() throws IOException {}
 }

+ 52 - 31
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java

@@ -50,6 +50,7 @@ import org.apache.hadoop.hdfs.server.common.Storage.FormatConfirmable;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
 import org.apache.hadoop.hdfs.server.common.Util;
+import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
 import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
@@ -582,11 +583,11 @@ public class FSImage implements Closeable {
   boolean loadFSImage(FSNamesystem target, MetaRecoveryContext recovery)
       throws IOException {
     FSImageStorageInspector inspector = storage.readAndInspectDirs();
+    FSImageFile imageFile = null;
     
     isUpgradeFinalized = inspector.isUpgradeFinalized();
  
-    FSImageStorageInspector.FSImageFile imageFile 
-      = inspector.getLatestImage();   
+    List<FSImageFile> imageFiles = inspector.getLatestImages();
     boolean needToSave = inspector.needToSave();
 
     Iterable<EditLogInputStream> editStreams = null;
@@ -599,7 +600,8 @@ public class FSImage implements Closeable {
       // we better be able to load all the edits. If we're the standby NN, it's
       // OK to not be able to read all of edits right now.
       long toAtLeastTxId = editLog.isOpenForWrite() ? inspector.getMaxSeenTxId() : 0;
-      editStreams = editLog.selectInputStreams(imageFile.getCheckpointTxId() + 1,
+      editStreams = editLog.selectInputStreams(
+          imageFiles.get(0).getCheckpointTxId() + 1,
           toAtLeastTxId, recovery, false);
     } else {
       editStreams = FSImagePreTransactionalStorageInspector
@@ -612,7 +614,6 @@ public class FSImage implements Closeable {
       elis.setMaxOpSize(maxOpSize);
     }
  
-    LOG.debug("Planning to load image :\n" + imageFile);
     for (EditLogInputStream l : editStreams) {
       LOG.debug("Planning to load edit log stream: " + l);
     }
@@ -620,34 +621,21 @@ public class FSImage implements Closeable {
       LOG.info("No edit log streams selected.");
     }
     
-    try {
-      StorageDirectory sdForProperties = imageFile.sd;
-      storage.readProperties(sdForProperties);
-
-      if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT,
-                                 getLayoutVersion())) {
-        // For txid-based layout, we should have a .md5 file
-        // next to the image file
-        loadFSImage(imageFile.getFile(), target, recovery);
-      } else if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM,
-                                        getLayoutVersion())) {
-        // In 0.22, we have the checksum stored in the VERSION file.
-        String md5 = storage.getDeprecatedProperty(
-            NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY);
-        if (md5 == null) {
-          throw new InconsistentFSStateException(sdForProperties.getRoot(),
-              "Message digest property " +
-              NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY +
-              " not set for storage directory " + sdForProperties.getRoot());
-        }
-        loadFSImage(imageFile.getFile(), new MD5Hash(md5), target, recovery);
-      } else {
-        // We don't have any record of the md5sum
-        loadFSImage(imageFile.getFile(), null, target, recovery);
+    for (int i = 0; i < imageFiles.size(); i++) {
+      try {
+        imageFile = imageFiles.get(i);
+        loadFSImageFile(target, recovery, imageFile);
+        break;
+      } catch (IOException ioe) {
+        LOG.error("Failed to load image from " + imageFile, ioe);
+        target.clear();
+        imageFile = null;
       }
-    } catch (IOException ioe) {
+    }
+    // Failed to load any images, error out
+    if (imageFile == null) {
       FSEditLog.closeAllStreams(editStreams);
-      throw new IOException("Failed to load image from " + imageFile, ioe);
+      throw new IOException("Failed to load an FSImage file!");
     }
     long txnsAdvanced = loadEdits(editStreams, target, recovery);
     needToSave |= needsResaveBasedOnStaleCheckpoint(imageFile.getFile(),
@@ -656,6 +644,35 @@ public class FSImage implements Closeable {
     return needToSave;
   }
 
+  void loadFSImageFile(FSNamesystem target, MetaRecoveryContext recovery,
+      FSImageFile imageFile) throws IOException {
+    LOG.debug("Planning to load image :\n" + imageFile);
+    StorageDirectory sdForProperties = imageFile.sd;
+    storage.readProperties(sdForProperties);
+
+    if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT,
+                               getLayoutVersion())) {
+      // For txid-based layout, we should have a .md5 file
+      // next to the image file
+      loadFSImage(imageFile.getFile(), target, recovery);
+    } else if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM,
+                                      getLayoutVersion())) {
+      // In 0.22, we have the checksum stored in the VERSION file.
+      String md5 = storage.getDeprecatedProperty(
+          NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY);
+      if (md5 == null) {
+        throw new InconsistentFSStateException(sdForProperties.getRoot(),
+            "Message digest property " +
+            NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY +
+            " not set for storage directory " + sdForProperties.getRoot());
+      }
+      loadFSImage(imageFile.getFile(), new MD5Hash(md5), target, recovery);
+    } else {
+      // We don't have any record of the md5sum
+      loadFSImage(imageFile.getFile(), null, target, recovery);
+    }
+  }
+
   public void initEditLog() {
     Preconditions.checkState(getNamespaceID() != 0,
         "Must know namespace ID before initting edit log");
@@ -1131,7 +1148,7 @@ public class FSImage implements Closeable {
    */
   public synchronized void saveDigestAndRenameCheckpointImage(
       long txid, MD5Hash digest) throws IOException {
-    renameCheckpoint(txid);
+    // Write and rename MD5 file
     List<StorageDirectory> badSds = Lists.newArrayList();
     
     for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
@@ -1144,6 +1161,10 @@ public class FSImage implements Closeable {
     }
     storage.reportErrorsOnDirectories(badSds);
     
+    CheckpointFaultInjector.getInstance().afterMD5Rename();
+    
+    // Rename image from tmp file
+    renameCheckpoint(txid);
     // So long as this is the newest image available,
     // advertise it as such to other checkpointers
     // from now on
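
Together with the reordering above (MD5 file first, image rename second), the new afterMD5Rename() hook lets a test inject a failure exactly between the two steps. A generic sketch of that fault-injection idiom, with hypothetical names standing in for FSImage and CheckpointFaultInjector rather than the real API:

import java.io.IOException;

public class ChecksumRenameFaultSketch {
  // Hypothetical stand-in for the fault injector.
  static class FaultInjector {
    private static FaultInjector instance = new FaultInjector();
    static FaultInjector getInstance() { return instance; }
    static void setInstance(FaultInjector injector) { instance = injector; }  // test hook
    void afterMD5Rename() throws IOException { /* no-op in production */ }
  }

  // Hypothetical stand-in for the checkpoint path shown above.
  static class Checkpointer {
    void saveDigestAndRenameImage() throws IOException {
      saveMd5File();                                 // write the .md5 first
      FaultInjector.getInstance().afterMD5Rename();  // a test can throw here
      renameTmpImage();                              // only then promote the image
    }
    private void saveMd5File() { }
    private void renameTmpImage() { }
  }

  public static void main(String[] args) {
    FaultInjector.setInstance(new FaultInjector() {
      @Override
      void afterMD5Rename() throws IOException {
        throw new IOException("simulated crash between MD5 save and image rename");
      }
    });
    try {
      new Checkpointer().saveDigestAndRenameImage();
      System.out.println("unexpected: checkpoint completed");
    } catch (IOException expected) {
      // The final image rename never happened, so only the temporary image
      // file is left behind at this point.
      System.out.println("caught: " + expected.getMessage());
    }
  }
}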

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java

@@ -300,8 +300,8 @@ public class FSImageFormat {
         loadSecretManagerState(in);
 
         // make sure to read to the end of file
-        int eof = in.read();
-        assert eof == -1 : "Should have reached the end of image file " + curFile;
+        boolean eof = (in.read() == -1);
+        assert eof : "Should have reached the end of image file " + curFile;
       } finally {
         in.close();
       }

+ 6 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java

@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
 
@@ -146,7 +147,7 @@ class FSImagePreTransactionalStorageInspector extends FSImageStorageInspector {
   }
     
   @Override
-  FSImageFile getLatestImage() throws IOException {
+  List<FSImageFile> getLatestImages() throws IOException {
     // We should have at least one image and one edits dirs
     if (latestNameSD == null)
       throw new IOException("Image file is not found in " + imageDirs);
@@ -176,9 +177,12 @@ class FSImagePreTransactionalStorageInspector extends FSImageStorageInspector {
 
     needToSaveAfterRecovery = doRecovery();
     
-    return new FSImageFile(latestNameSD, 
+    FSImageFile file = new FSImageFile(latestNameSD, 
         NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE),
         HdfsConstants.INVALID_TXID);
+    LinkedList<FSImageFile> ret = new LinkedList<FSImageFile>();
+    ret.add(file);
+    return ret;
   }
 
   @Override

+ 3 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java

@@ -19,6 +19,8 @@ package org.apache.hadoop.hdfs.server.namenode;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.List;
+
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
@@ -45,7 +47,7 @@ abstract class FSImageStorageInspector {
    * Get the image files which should be loaded into the filesystem.
    * @throws IOException if not enough files are available (eg no image found in any directory)
    */
-  abstract FSImageFile getLatestImage() throws IOException;
+  abstract List<FSImageFile> getLatestImages() throws IOException;
 
   /** 
    * Get the minimum tx id which should be loaded with this set of images.

+ 19 - 11
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java

@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -108,24 +109,31 @@ class FSImageTransactionalStorageInspector extends FSImageStorageInspector {
   }
   
   /**
-   * @return the image that has the most recent associated transaction ID.
-   * If there are multiple storage directories which contain equal images 
-   * the storage directory that was inspected first will be preferred.
+   * @return the image files that have the most recent associated 
+   * transaction IDs.  If there are multiple storage directories which 
+   * contain equal images, we'll return them all.
    * 
    * @throws FileNotFoundException if not images are found.
    */
   @Override
-  FSImageFile getLatestImage() throws IOException {
-    if (foundImages.isEmpty()) {
-      throw new FileNotFoundException("No valid image files found");
-    }
-
-    FSImageFile ret = null;
+  List<FSImageFile> getLatestImages() throws IOException {
+    LinkedList<FSImageFile> ret = new LinkedList<FSImageFile>();
     for (FSImageFile img : foundImages) {
-      if (ret == null || img.txId > ret.txId) {
-        ret = img;
+      if (ret.isEmpty()) {
+        ret.add(img);
+      } else {
+        FSImageFile cur = ret.getFirst();
+        if (cur.txId == img.txId) {
+          ret.add(img);
+        } else if (cur.txId < img.txId) {
+          ret.clear();
+          ret.add(img);
+        }
       }
     }
+    if (ret.isEmpty()) {
+      throw new FileNotFoundException("No valid image files found");
+    }
     return ret;
   }
   

+ 67 - 8
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -34,6 +34,7 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
@@ -603,7 +604,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
                                        DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
 
-      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 0);
+      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
+          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
       this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
       LOG.info("Append Enabled: " + supportAppends);
 
@@ -1137,8 +1139,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    */
   void metaSave(String filename) throws IOException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.UNCHECKED);
     writeLock();
     try {
+      checkOperation(OperationCategory.UNCHECKED);
       File file = new File(System.getProperty("hadoop.log.dir"), filename);
       PrintWriter out = new PrintWriter(new BufferedWriter(
           new OutputStreamWriter(new FileOutputStream(file, true), Charsets.UTF_8)));
@@ -1212,6 +1216,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       UnresolvedLinkException, IOException {
     HdfsFileStatus resultingStat = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1249,6 +1254,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       UnresolvedLinkException, IOException {
     HdfsFileStatus resultingStat = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1359,13 +1365,20 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       throws FileNotFoundException, UnresolvedLinkException, IOException {
 
     for (int attempt = 0; attempt < 2; attempt++) {
-      if (attempt == 0) { // first attempt is with readlock
+      boolean isReadOp = (attempt == 0);
+      if (isReadOp) { // first attempt is with readlock
+        checkOperation(OperationCategory.READ);
         readLock();
       }  else { // second attempt is with  write lock
+        checkOperation(OperationCategory.WRITE);
         writeLock(); // writelock is needed to set accesstime
       }
       try {
-        checkOperation(OperationCategory.READ);
+        if (isReadOp) {
+          checkOperation(OperationCategory.READ);
+        } else {
+          checkOperation(OperationCategory.WRITE);
+        }
 
         // if the namenode is in safemode, then do not update access time
         if (isInSafeMode()) {
@@ -1380,7 +1393,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
           if (now <= inode.getAccessTime() + getAccessTimePrecision()) {
             // if we have to set access time but we only have the readlock, then
             // restart this entire operation with the writeLock.
-            if (attempt == 0) {
+            if (isReadOp) {
               continue;
             }
           }
@@ -1392,7 +1405,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
         return blockManager.createLocatedBlocks(inode.getBlocks(), fileSize,
             inode.isUnderConstruction(), offset, length, needBlockToken);
       } finally {
-        if (attempt == 0) {
+        if (isReadOp) {
           readUnlock();
         } else {
           writeUnlock();
@@ -1448,6 +1461,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
     HdfsFileStatus resultingStat = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1595,6 +1609,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     }
     HdfsFileStatus resultingStat = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1636,6 +1651,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       throws IOException, UnresolvedLinkException {
     HdfsFileStatus resultingStat = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1711,6 +1727,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     blockManager.verifyReplication(src, replication, null);
     final boolean isFile;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1741,6 +1758,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   long getPreferredBlockSize(String filename) 
       throws IOException, UnresolvedLinkException {
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
       checkOperation(OperationCategory.READ);
@@ -1803,6 +1821,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     boolean skipSync = false;
     final HdfsFileStatus stat;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -1995,6 +2014,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       throws IOException {
     boolean skipSync = false;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2132,6 +2152,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     }
     LocatedBlock lb = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2198,8 +2219,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     }
 
     // Part I. Analyze the state of the file with respect to the input data.
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
+      checkOperation(OperationCategory.READ);
       LocatedBlock[] onRetryBlock = new LocatedBlock[1];
       final INode[] inodes = analyzeFileState(
           src, fileId, clientName, previous, onRetryBlock).getINodes();
@@ -2226,8 +2249,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     // Allocate a new block, add it to the INode and the BlocksMap. 
     Block newBlock = null;
     long offset;
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
+      checkOperation(OperationCategory.WRITE);
       // Run the full analysis again, since things could have changed
       // while chooseTarget() was executing.
       LocatedBlock[] onRetryBlock = new LocatedBlock[1];
@@ -2379,9 +2404,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     final DatanodeDescriptor clientnode;
     final long preferredblocksize;
     final List<DatanodeDescriptor> chosen;
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
-      checkOperation(OperationCategory.WRITE);
+      checkOperation(OperationCategory.READ);
       //check safe mode
       if (isInSafeMode()) {
         throw new SafeModeException("Cannot add datanode; src=" + src
@@ -2421,6 +2447,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   boolean abandonBlock(ExtendedBlock b, String src, String holder)
       throws LeaseExpiredException, FileNotFoundException,
       UnresolvedLinkException, IOException {
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2498,6 +2525,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     throws SafeModeException, UnresolvedLinkException, IOException {
     checkBlock(last);
     boolean success = false;
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2669,6 +2697,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
           " to " + dst);
     }
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2725,6 +2754,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
           + src + " to " + dst);
     }
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2811,6 +2841,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
              IOException {
     BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -2939,6 +2970,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
            StandbyException, IOException {
     HdfsFileStatus stat = null;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
       checkOperation(OperationCategory.READ);
@@ -2981,6 +3013,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
     }
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -3041,6 +3074,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       FileNotFoundException, UnresolvedLinkException, StandbyException {
     FSPermissionChecker pc = new FSPermissionChecker(fsOwnerShortUserName,
         supergroup);
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
       checkOperation(OperationCategory.READ);
@@ -3061,6 +3095,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   void setQuota(String path, long nsQuota, long dsQuota) 
       throws IOException, UnresolvedLinkException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -3084,6 +3119,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   void fsync(String src, String clientName, long lastBlockLength) 
       throws IOException, UnresolvedLinkException {
     NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -3293,6 +3329,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
       String[] newtargetstorages)
       throws IOException, UnresolvedLinkException {
     String src = "";
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -3397,6 +3434,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    * Renew the lease(s) held by the given client
    */
   void renewLease(String holder) throws IOException {
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -3438,6 +3476,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     throws AccessControlException, UnresolvedLinkException, IOException {
     DirectoryListing dl;
     FSPermissionChecker pc = getPermissionChecker();
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
       checkOperation(OperationCategory.READ);
@@ -3734,10 +3773,12 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   }
 
   DatanodeInfo[] datanodeReport(final DatanodeReportType type
-      ) throws AccessControlException {
+      ) throws AccessControlException, StandbyException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.UNCHECKED);
     readLock();
     try {
+      checkOperation(OperationCategory.UNCHECKED);
       final DatanodeManager dm = getBlockManager().getDatanodeManager();      
       final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
 
@@ -3761,8 +3802,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    */
   void saveNamespace() throws AccessControlException, IOException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.UNCHECKED);
     readLock();
     try {
+      checkOperation(OperationCategory.UNCHECKED);
       if (!isInSafeMode()) {
         throw new IOException("Safe mode should be turned ON " +
                               "in order to create namespace image.");
@@ -3780,10 +3823,13 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    * 
    * @throws AccessControlException if superuser privilege is violated.
    */
-  boolean restoreFailedStorage(String arg) throws AccessControlException {
+  boolean restoreFailedStorage(String arg) throws AccessControlException,
+      StandbyException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.UNCHECKED);
     writeLock();
     try {
+      checkOperation(OperationCategory.UNCHECKED);
       
       // if it is disabled - enable it and vice versa.
       if(arg.equals("check"))
@@ -3804,6 +3850,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     
   void finalizeUpgrade() throws IOException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -4543,6 +4590,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
   CheckpointSignature rollEditLog() throws IOException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.JOURNAL);
     writeLock();
     try {
       checkOperation(OperationCategory.JOURNAL);
@@ -4560,6 +4608,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
                                 NamenodeRegistration bnReg, // backup node
                                 NamenodeRegistration nnReg) // active name-node
   throws IOException {
+    checkOperation(OperationCategory.CHECKPOINT);
     writeLock();
     try {
       checkOperation(OperationCategory.CHECKPOINT);
@@ -4578,6 +4627,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
   void endCheckpoint(NamenodeRegistration registration,
                             CheckpointSignature sig) throws IOException {
+    checkOperation(OperationCategory.CHECKPOINT);
     readLock();
     try {
       checkOperation(OperationCategory.CHECKPOINT);
@@ -4866,6 +4916,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    * Client is reporting some bad block locations.
    */
   void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -4900,6 +4951,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
       String clientName) throws IOException {
     LocatedBlock locatedBlock;
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -4931,6 +4983,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   void updatePipeline(String clientName, ExtendedBlock oldBlock, 
       ExtendedBlock newBlock, DatanodeID[] newNodes)
       throws IOException {
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -5058,8 +5111,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    */
   void releaseBackupNode(NamenodeRegistration registration)
     throws IOException {
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
+      checkOperation(OperationCategory.WRITE);
       if(getFSImage().getStorage().getNamespaceID()
          != registration.getNamespaceID())
         throw new IOException("Incompatible namespaceIDs: "
@@ -5098,6 +5153,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
 	String[] cookieTab) throws IOException {
     checkSuperuserPrivilege();
+    checkOperation(OperationCategory.READ);
     readLock();
     try {
       checkOperation(OperationCategory.READ);
@@ -5190,6 +5246,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
       throws IOException {
     Token<DelegationTokenIdentifier> token;
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -5236,6 +5293,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   long renewDelegationToken(Token<DelegationTokenIdentifier> token)
       throws InvalidToken, IOException {
     long expiryTime;
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);
@@ -5268,6 +5326,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    */
   void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
       throws IOException {
+    checkOperation(OperationCategory.WRITE);
     writeLock();
     try {
       checkOperation(OperationCategory.WRITE);

+ 0 - 6
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java

@@ -339,7 +339,6 @@ class NameNodeRpcServer implements NamenodeProtocols {
       throw new IllegalArgumentException(
         "Unexpected not positive size: "+size);
     }
-    namesystem.checkOperation(OperationCategory.READ);
     namesystem.checkSuperuserPrivilege();
     return namesystem.getBlockManager().getBlocks(datanode, size); 
   }
@@ -709,7 +708,6 @@ class NameNodeRpcServer implements NamenodeProtocols {
   @Override // ClientProtocol
   public DatanodeInfo[] getDatanodeReport(DatanodeReportType type)
   throws IOException {
-    namesystem.checkOperation(OperationCategory.UNCHECKED);
     DatanodeInfo results[] = namesystem.datanodeReport(type);
     if (results == null ) {
       throw new IOException("Cannot find datanode report");
@@ -734,19 +732,16 @@ class NameNodeRpcServer implements NamenodeProtocols {
 
   @Override // ClientProtocol
   public boolean restoreFailedStorage(String arg) throws IOException { 
-    namesystem.checkOperation(OperationCategory.UNCHECKED);
     return namesystem.restoreFailedStorage(arg);
   }
 
   @Override // ClientProtocol
   public void saveNamespace() throws IOException {
-    namesystem.checkOperation(OperationCategory.UNCHECKED);
     namesystem.saveNamespace();
   }
   
   @Override // ClientProtocol
   public long rollEdits() throws AccessControlException, IOException {
-    namesystem.checkOperation(OperationCategory.JOURNAL);
     CheckpointSignature sig = namesystem.rollEditLog();
     return sig.getCurSegmentTxId();
   }
@@ -791,7 +786,6 @@ class NameNodeRpcServer implements NamenodeProtocols {
 
   @Override // ClientProtocol
   public void metaSave(String filename) throws IOException {
-    namesystem.checkOperation(OperationCategory.UNCHECKED);
     namesystem.metaSave(filename);
   }
 

+ 4 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java

@@ -18,7 +18,9 @@
 package org.apache.hadoop.hdfs.server.namenode;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
 import org.apache.hadoop.hdfs.util.RwLock;
+import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.security.AccessControlException;
 
 /** Namesystem operations. */
@@ -38,4 +40,6 @@ public interface Namesystem extends RwLock, SafeMode {
   public boolean isGenStampInFuture(long generationStamp);
 
   public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal);
+
+  public void checkOperation(OperationCategory read) throws StandbyException;
 }

+ 11 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java

@@ -64,9 +64,17 @@ public interface HAContext {
   void writeUnlock();
 
   /**
-   * Verify that the given operation category is allowed in the
-   * current state. This is to allow NN implementations (eg BackupNode)
-   * to override it with node-specific handling.
+   * Verify that the given operation category is allowed in the current state.
+   * This is to allow NN implementations (eg BackupNode) to override it with
+   * node-specific handling.
+   * 
+   * If the operation which is being checked will be taking the FSNS lock, it's
+   * advisable to check the operation category both immediately before and after
+   * taking the lock. This is because clients rely on the StandbyException
+   * thrown by this method in order to trigger client failover, and if a client
+   * first tries to contact the Standby NN, it could block for a long time if
+   * the Standby is holding the lock for a while, e.g. when performing a
+   * checkpoint. See HDFS-4591 for more details.
    */
   void checkOperation(OperationCategory op) throws StandbyException;
 

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/LightWeightGSet.java

@@ -72,7 +72,7 @@ public class LightWeightGSet<K, E extends K> implements GSet<K, E> {
   /** Modification version for fail-fast.
    * @see ConcurrentModificationException
    */
-  private volatile int modification = 0;
+  private int modification = 0;
 
   /**
    * @param recommended_length Recommended size of the internal array.

+ 43 - 11
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java

@@ -29,6 +29,7 @@ import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.security.PrivilegedExceptionAction;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
@@ -376,17 +377,6 @@ public class WebHdfsFileSystem extends FileSystem
     return url;
   }
 
-  private HttpURLConnection getHttpUrlConnection(URL url)
-      throws IOException, AuthenticationException {
-    final HttpURLConnection conn;
-    if (ugi.hasKerberosCredentials()) { 
-      conn = new AuthenticatedURL(AUTH).openConnection(url, authToken);
-    } else {
-      conn = (HttpURLConnection)url.openConnection();
-    }
-    return conn;
-  }
-
   /**
    * Run a http operation.
    * Connect to the http server, validate response, and obtain the JSON output.
@@ -431,6 +421,48 @@ public class WebHdfsFileSystem extends FileSystem
       this.conn = conn;
     }
 
+    private HttpURLConnection getHttpUrlConnection(final URL url)
+        throws IOException, AuthenticationException {
+      UserGroupInformation connectUgi = ugi.getRealUser();
+      if (connectUgi == null) {
+        connectUgi = ugi;
+      }
+      try {
+        return connectUgi.doAs(
+            new PrivilegedExceptionAction<HttpURLConnection>() {
+              @Override
+              public HttpURLConnection run() throws IOException {
+                return openHttpUrlConnection(url);
+              }
+            });
+      } catch (IOException ioe) {
+        Throwable cause = ioe.getCause();
+        if (cause != null && cause instanceof AuthenticationException) {
+          throw (AuthenticationException)cause;
+        }
+        throw ioe;
+      } catch (InterruptedException e) {
+        throw new IOException(e);
+      }
+    }
+    
+    private HttpURLConnection openHttpUrlConnection(final URL url)
+        throws IOException {
+      final HttpURLConnection conn;
+      try {
+        if (op.getRequireAuth()) {
+          LOG.debug("open AuthenticatedURL connection");
+          conn = new AuthenticatedURL(AUTH).openConnection(url, authToken);
+        } else {
+          LOG.debug("open URL connection");
+          conn = (HttpURLConnection)url.openConnection();
+        }
+      } catch (AuthenticationException e) {
+        throw new IOException(e);
+      }
+      return conn;
+    }
+  
     private void init() throws IOException {
       checkRetry = !redirected;
       try {
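
The relocated getHttpUrlConnection now opens the connection inside a doAs block so that, for proxy users, the real (Kerberos-authenticated) user performs the authenticated connection setup, and an AuthenticationException wrapped by doAs is unwrapped again. A minimal sketch of the doAs-as-real-user part of that idiom; the Task interface and method name are illustrative, not part of the Hadoop API:

import java.io.IOException;
import java.security.PrivilegedExceptionAction;

import org.apache.hadoop.security.UserGroupInformation;

public class DoAsRealUserSketch {
  /** Illustrative callback type. */
  interface Task<T> {
    T run() throws IOException;
  }

  static <T> T runAsConnectUser(UserGroupInformation ugi, final Task<T> task)
      throws IOException, InterruptedException {
    // For a proxy user, only the real user holds credentials that can
    // actually authenticate (e.g. a Kerberos TGT); otherwise use ugi itself.
    UserGroupInformation connectUgi = ugi.getRealUser();
    if (connectUgi == null) {
      connectUgi = ugi;
    }
    return connectUgi.doAs(new PrivilegedExceptionAction<T>() {
      @Override
      public T run() throws IOException {
        return task.run();
      }
    });
  }
}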

+ 64 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/LogVerificationAppender.java

@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.log4j.AppenderSkeleton;
+import org.apache.log4j.spi.LoggingEvent;
+import org.apache.log4j.spi.ThrowableInformation;
+
+/**
+ * Used to verify that certain exceptions or messages are present in log output.
+ */
+public class LogVerificationAppender extends AppenderSkeleton {
+  private final List<LoggingEvent> log = new ArrayList<LoggingEvent>();
+
+  @Override
+  public boolean requiresLayout() {
+    return false;
+  }
+
+  @Override
+  protected void append(final LoggingEvent loggingEvent) {
+    log.add(loggingEvent);
+  }
+
+  @Override
+  public void close() {
+  }
+
+  public List<LoggingEvent> getLog() {
+    return new ArrayList<LoggingEvent>(log);
+  }
+  
+  public int countExceptionsWithMessage(final String text) {
+    int count = 0;
+    for (LoggingEvent e: getLog()) {
+      ThrowableInformation t = e.getThrowableInformation();
+      if (t != null) {
+        String m = t.getThrowable().getMessage();
+        if (m.contains(text)) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+}

+ 10 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgradeFromImage.java

@@ -45,6 +45,7 @@ import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.log4j.Logger;
 import org.junit.Test;
 
 /**
@@ -293,6 +294,11 @@ public class TestDFSUpgradeFromImage {
         new File(baseDir, "name2/current/VERSION"),
         "imageMD5Digest", "22222222222222222222222222222222");
     
+    // Attach our own log appender so we can verify output
+    final LogVerificationAppender appender = new LogVerificationAppender();
+    final Logger logger = Logger.getRootLogger();
+    logger.addAppender(appender);
+
     // Upgrade should now fail
     try {
       upgradeAndVerify(new MiniDFSCluster.Builder(upgradeConf).
@@ -300,9 +306,12 @@ public class TestDFSUpgradeFromImage {
       fail("Upgrade did not fail with bad MD5");
     } catch (IOException ioe) {
       String msg = StringUtils.stringifyException(ioe);
-      if (!msg.contains("is corrupt with MD5 checksum")) {
+      if (!msg.contains("Failed to load an FSImage file")) {
         throw ioe;
       }
+      int md5failures = appender.countExceptionsWithMessage(
+          " is corrupt with MD5 checksum of ");
+      assertEquals("Upgrade did not fail with bad MD5", 1, md5failures);
     }
   }
     

+ 3 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestHftpURLTimeouts.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.hdfs;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
@@ -39,7 +40,7 @@ import org.junit.Test;
 public class TestHftpURLTimeouts {
   @BeforeClass
   public static void setup() {
-    URLUtils.SOCKET_TIMEOUT = 1;
+    URLUtils.SOCKET_TIMEOUT = 5;
   }
   
   @Test
@@ -116,6 +117,7 @@ public class TestHftpURLTimeouts {
           conns.add(fs.openConnection("/", ""));
         } catch (SocketTimeoutException ste) {
           String message = ste.getMessage();
+          assertNotNull(message);
           // https will get a read timeout due to SSL negotiation, but
           // a normal http will not, so need to ignore SSL read timeouts
           // until a connect timeout occurs

+ 5 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java

@@ -65,7 +65,7 @@ public class TestMiniDFSCluster {
    *
    * @throws Throwable on a failure
    */
-  @Test
+  @Test(timeout=100000)
   public void testClusterWithoutSystemProperties() throws Throwable {
     System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA);
     Configuration conf = new HdfsConfiguration();
@@ -74,7 +74,8 @@ public class TestMiniDFSCluster {
     conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, c1Path);
     MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
     try {
-      assertEquals(c1Path+"/data", cluster.getDataDirectory());
+      assertEquals(new File(c1Path + "/data"),
+          new File(cluster.getDataDirectory()));
     } finally {
       cluster.shutdown();
     }
@@ -84,7 +85,7 @@ public class TestMiniDFSCluster {
    * Bring up two clusters and assert that they are in different directories.
    * @throws Throwable on a failure
    */
-  @Test
+  @Test(timeout=100000)
   public void testDualClusters() throws Throwable {
     File testDataCluster2 = new File(testDataPath, CLUSTER_2);
     File testDataCluster3 = new File(testDataPath, CLUSTER_3);
@@ -95,7 +96,7 @@ public class TestMiniDFSCluster {
     MiniDFSCluster cluster3 = null;
     try {
       String dataDir2 = cluster2.getDataDirectory();
-      assertEquals(c2Path + "/data", dataDir2);
+      assertEquals(new File(c2Path + "/data"), new File(dataDir2));
       //change the data dir
       conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR,
                testDataCluster3.getAbsolutePath());

+ 108 - 56
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestShortCircuitLocalRead.java

@@ -18,9 +18,11 @@
 package org.apache.hadoop.hdfs;
 
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
 
 import java.io.EOFException;
 import java.io.IOException;
+import java.net.URI;
 import java.nio.ByteBuffer;
 import java.security.PrivilegedExceptionAction;
 
@@ -32,6 +34,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
 import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
 import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
@@ -85,9 +88,20 @@ public class TestShortCircuitLocalRead {
       }
     }
   }
+  
+  private static String getCurrentUser() throws IOException {
+    return UserGroupInformation.getCurrentUser().getShortUserName();
+  }
 
-  static void checkFileContent(FileSystem fs, Path name, byte[] expected,
-      int readOffset) throws IOException {
+  /** Check file content, reading as user {@code readingUser} */
+  static void checkFileContent(URI uri, Path name, byte[] expected,
+      int readOffset, String readingUser, Configuration conf,
+      boolean shortCircuitFails)
+      throws IOException, InterruptedException {
+    // Ensure short circuit is enabled
+    DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
+    assertTrue(fs.getClient().getShortCircuitLocalReads());
+    
     FSDataInputStream stm = fs.open(name);
     byte[] actual = new byte[expected.length-readOffset];
     stm.readFully(readOffset, actual);
@@ -112,6 +126,11 @@ public class TestShortCircuitLocalRead {
       nread += nbytes;
     }
     checkData(actual, readOffset, expected, "Read 3");
+    
+    if (shortCircuitFails) {
+      // short circuit should be disabled due to failure
+      assertFalse(fs.getClient().getShortCircuitLocalReads());
+    }
     stm.close();
   }
 
@@ -123,11 +142,15 @@ public class TestShortCircuitLocalRead {
     return arr;
   }
   
-  /**
-   * Verifies that reading a file with the direct read(ByteBuffer) api gives the expected set of bytes.
-   */
-  static void checkFileContentDirect(FileSystem fs, Path name, byte[] expected,
-      int readOffset) throws IOException {
+  /** Check the file content, reading as user {@code readingUser} */
+  static void checkFileContentDirect(URI uri, Path name, byte[] expected,
+      int readOffset, String readingUser, Configuration conf,
+      boolean shortCircuitFails)
+      throws IOException, InterruptedException {
+    // Ensure short circuit is enabled
+    DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
+    assertTrue(fs.getClient().getShortCircuitLocalReads());
+    
     HdfsDataInputStream stm = (HdfsDataInputStream)fs.open(name);
 
     ByteBuffer actual = ByteBuffer.allocateDirect(expected.length - readOffset);
@@ -157,21 +180,33 @@ public class TestShortCircuitLocalRead {
       nread += nbytes;
     }
     checkData(arrayFromByteBuffer(actual), readOffset, expected, "Read 3");
+    if (shortCircuitFails) {
+      // short circuit should be disabled due to failure
+      assertFalse(fs.getClient().getShortCircuitLocalReads());
+    }
     stm.close();
   }
 
+  public void doTestShortCircuitRead(boolean ignoreChecksum, int size,
+      int readOffset) throws IOException, InterruptedException {
+    String shortCircuitUser = getCurrentUser();
+    doTestShortCircuitRead(ignoreChecksum, size, readOffset, shortCircuitUser,
+        shortCircuitUser, false);
+  }
+  
   /**
    * Test that file data can be read by reading the block file
    * directly from the local store.
    */
   public void doTestShortCircuitRead(boolean ignoreChecksum, int size,
-      int readOffset) throws IOException {
+      int readOffset, String shortCircuitUser, String readingUser,
+      boolean shortCircuitFails) throws IOException, InterruptedException {
     Configuration conf = new Configuration();
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
         ignoreChecksum);
     conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
-        UserGroupInformation.getCurrentUser().getShortUserName());
+        shortCircuitUser);
     if (simulatedStorage) {
       SimulatedFSDataset.setFactory(conf);
     }
@@ -184,53 +219,88 @@ public class TestShortCircuitLocalRead {
       assertTrue("/ should be a directory", fs.getFileStatus(path)
           .isDirectory() == true);
       
-      byte[] fileData = AppendTestUtil.randomBytes(seed, size);
       // create a new file in home directory. Do not close it.
-      Path file1 = new Path("filelocal.dat");
+      byte[] fileData = AppendTestUtil.randomBytes(seed, size);
+      Path file1 = fs.makeQualified(new Path("filelocal.dat"));
       FSDataOutputStream stm = createFile(fs, file1, 1);
-
-      // write to file
       stm.write(fileData);
       stm.close();
-      checkFileContent(fs, file1, fileData, readOffset);
-      checkFileContentDirect(fs, file1, fileData, readOffset);
+      
+      URI uri = cluster.getURI();
+      checkFileContent(uri, file1, fileData, readOffset, readingUser, conf,
+          shortCircuitFails);
+      checkFileContentDirect(uri, file1, fileData, readOffset, readingUser,
+          conf, shortCircuitFails);
     } finally {
       fs.close();
       cluster.shutdown();
     }
   }
 
-  @Test
-  public void testFileLocalReadNoChecksum() throws IOException {
+  @Test(timeout=10000)
+  public void testFileLocalReadNoChecksum() throws Exception {
     doTestShortCircuitRead(true, 3*blockSize+100, 0);
   }
 
-  @Test
-  public void testFileLocalReadChecksum() throws IOException {
+  @Test(timeout=10000)
+  public void testFileLocalReadChecksum() throws Exception {
     doTestShortCircuitRead(false, 3*blockSize+100, 0);
   }
   
-  @Test
-  public void testSmallFileLocalRead() throws IOException {
+  @Test(timeout=10000)
+  public void testSmallFileLocalRead() throws Exception {
     doTestShortCircuitRead(false, 13, 0);
     doTestShortCircuitRead(false, 13, 5);
     doTestShortCircuitRead(true, 13, 0);
     doTestShortCircuitRead(true, 13, 5);
   }
   
-  @Test
-  public void testReadFromAnOffset() throws IOException {
+  /**
+   * Try a short circuit read from a reader that is not allowed to
+   * use short circuit. The test ensures the reader falls back to
+   * non-short-circuit reads when short circuit is disallowed.
+   */
+  @Test(timeout=10000)
+  public void testLocalReadFallback() throws Exception {
+    doTestShortCircuitRead(true, 13, 0, getCurrentUser(), "notallowed", true);
+  }
+  
+  @Test(timeout=10000)
+  public void testReadFromAnOffset() throws Exception {
     doTestShortCircuitRead(false, 3*blockSize+100, 777);
     doTestShortCircuitRead(true, 3*blockSize+100, 777);
   }
   
-  @Test
-  public void testLongFile() throws IOException {
+  @Test(timeout=10000)
+  public void testLongFile() throws Exception {
     doTestShortCircuitRead(false, 10*blockSize+100, 777);
     doTestShortCircuitRead(true, 10*blockSize+100, 777);
   }
    
-  @Test
+  private ClientDatanodeProtocol getProxy(UserGroupInformation ugi,
+      final DatanodeID dnInfo, final Configuration conf) throws IOException,
+      InterruptedException {
+    return ugi.doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
+      @Override
+      public ClientDatanodeProtocol run() throws Exception {
+        return DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf, 60000,
+            false);
+      }
+    });
+  }
+  
+  private static DistributedFileSystem getFileSystem(String user, final URI uri,
+      final Configuration conf) throws InterruptedException, IOException {
+    UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
+    return ugi.doAs(new PrivilegedExceptionAction<DistributedFileSystem>() {
+      @Override
+      public DistributedFileSystem run() throws Exception {
+        return (DistributedFileSystem)FileSystem.get(uri, conf);
+      }
+    });
+  }
+  
+  @Test(timeout=10000)
   public void testGetBlockLocalPathInfo() throws IOException, InterruptedException {
     final Configuration conf = new Configuration();
     conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
@@ -253,15 +323,7 @@ public class TestShortCircuitLocalRead {
       ExtendedBlock blk = new ExtendedBlock(lb.get(0).getBlock());
       Token<BlockTokenIdentifier> token = lb.get(0).getBlockToken();
       final DatanodeInfo dnInfo = lb.get(0).getLocations()[0];
-      ClientDatanodeProtocol proxy = aUgi1
-          .doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
-            @Override
-            public ClientDatanodeProtocol run() throws Exception {
-              return DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf,
-                  60000, false);
-            }
-          });
-      
+      ClientDatanodeProtocol proxy = getProxy(aUgi1, dnInfo, conf);
       // This should succeed
       BlockLocalPathInfo blpi = proxy.getBlockLocalPathInfo(blk, token);
       Assert.assertEquals(
@@ -269,14 +331,7 @@ public class TestShortCircuitLocalRead {
           blpi.getBlockPath());
 
       // Try with the other allowed user
-      proxy = aUgi2
-          .doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
-            @Override
-            public ClientDatanodeProtocol run() throws Exception {
-              return DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf,
-                  60000, false);
-            }
-          });
+      proxy = getProxy(aUgi2, dnInfo, conf);
 
       // This should succeed as well
       blpi = proxy.getBlockLocalPathInfo(blk, token);
@@ -287,14 +342,7 @@ public class TestShortCircuitLocalRead {
       // Now try with a disallowed user
       UserGroupInformation bUgi = UserGroupInformation
           .createRemoteUser("notalloweduser");
-      proxy = bUgi
-          .doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
-            @Override
-            public ClientDatanodeProtocol run() throws Exception {
-              return DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf,
-                  60000, false);
-            }
-          });
+      proxy = getProxy(bUgi, dnInfo, conf);
       try {
         proxy.getBlockLocalPathInfo(blk, token);
         Assert.fail("The call should have failed as " + bUgi.getShortUserName()
@@ -309,14 +357,14 @@ public class TestShortCircuitLocalRead {
     }
   }
 
-  @Test
+  @Test(timeout=10000)
   public void testSkipWithVerifyChecksum() throws IOException {
     int size = blockSize;
     Configuration conf = new Configuration();
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
     conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
-        UserGroupInformation.getCurrentUser().getShortUserName());
+        getCurrentUser());
     if (simulatedStorage) {
       SimulatedFSDataset.setFactory(conf);
     }
@@ -356,7 +404,7 @@ public class TestShortCircuitLocalRead {
   }
      
   /**
-   * Test to run benchmarks between shortcircuit read vs regular read with
+   * Test to run benchmarks between short circuit read vs regular read with
    * specified number of threads simultaneously reading.
    * <br>
    * Run this using the following command:
@@ -374,7 +422,7 @@ public class TestShortCircuitLocalRead {
     int threadCount = Integer.valueOf(args[2]);
 
     // Setup create a file
-    Configuration conf = new Configuration();
+    final Configuration conf = new Configuration();
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, shortcircuit);
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
         checksum);
@@ -400,9 +448,13 @@ public class TestShortCircuitLocalRead {
         public void run() {
           for (int i = 0; i < iteration; i++) {
             try {
-              checkFileContent(fs, file1, dataToWrite, 0);
+              String user = getCurrentUser();
+              checkFileContent(fs.getUri(), file1, dataToWrite, 0, user, conf,
+                  true);
             } catch (IOException e) {
               e.printStackTrace();
+            } catch (InterruptedException e) {
+              e.printStackTrace();
             }
           }
         }

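The changes above route every read through a filesystem handle obtained for a specific user, so the same test body can exercise both an allowed and a disallowed short-circuit reader. The sketch below isolates that UGI/doAs pattern under stated assumptions: the user names and the file:/// URI are placeholders, and a real test would pass cluster.getURI() and cast the result to DistributedFileSystem so it can assert on getClient().getShortCircuitLocalReads().

import java.net.URI;
import java.security.PrivilegedExceptionAction;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.security.UserGroupInformation;

public class ReadAsUserSketch {
  // Open a filesystem handle under an arbitrary (remote) user name.
  static FileSystem openAs(String user, final URI uri, final Configuration conf)
      throws Exception {
    UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
    return ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
      @Override
      public FileSystem run() throws Exception {
        return FileSystem.get(uri, conf);
      }
    });
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Short circuit is enabled but restricted to the user named "allowed";
    // these are the same keys set by doTestShortCircuitRead above.
    conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
    conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY, "allowed");

    // file:/// keeps the sketch runnable without a cluster (placeholder URI).
    FileSystem asAllowed = openAs("allowed", URI.create("file:///"), conf);
    FileSystem asDenied = openAs("notallowed", URI.create("file:///"), conf);
    System.out.println(asAllowed.getUri() + " / " + asDenied.getUri());
    asAllowed.close();
    asDenied.close();
  }
}
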
+ 20 - 10
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournalNode.java

@@ -46,6 +46,7 @@ import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.test.MetricsAsserts;
+import org.apache.hadoop.util.Shell;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -65,6 +66,8 @@ public class TestJournalNode {
   private Configuration conf = new Configuration();
   private IPCLoggerChannel ch;
   private String journalId;
+  private File TEST_BUILD_DATA =
+      new File(System.getProperty("test.build.data", "build/test/data"));
 
   static {
     // Avoid an error when we double-initialize JvmMetrics
@@ -96,7 +99,7 @@ public class TestJournalNode {
     jn.stop(0);
   }
   
-  @Test
+  @Test(timeout=100000)
   public void testJournal() throws Exception {
     MetricsRecordBuilder metrics = MetricsAsserts.getMetrics(
         journal.getMetricsForTests().getName());
@@ -129,7 +132,7 @@ public class TestJournalNode {
   }
   
   
-  @Test
+  @Test(timeout=100000)
   public void testReturnsSegmentInfoAtEpochTransition() throws Exception {
     ch.newEpoch(1).get();
     ch.setEpoch(1);
@@ -157,7 +160,7 @@ public class TestJournalNode {
     assertEquals(1, response.getLastSegmentTxId());
   }
   
-  @Test
+  @Test(timeout=100000)
   public void testHttpServer() throws Exception {
     InetSocketAddress addr = jn.getBoundHttpAddress();
     assertTrue(addr.getPort() > 0);
@@ -210,7 +213,7 @@ public class TestJournalNode {
    * Test that the JournalNode performs correctly as a Paxos
    * <em>Acceptor</em> process.
    */
-  @Test
+  @Test(timeout=100000)
   public void testAcceptRecoveryBehavior() throws Exception {
     // We need to run newEpoch() first, or else we have no way to distinguish
     // different proposals for the same decision.
@@ -270,20 +273,27 @@ public class TestJournalNode {
     }
   }
   
-  @Test
+  @Test(timeout=100000)
   public void testFailToStartWithBadConfig() throws Exception {
     Configuration conf = new Configuration();
     conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY, "non-absolute-path");
     assertJNFailsToStart(conf, "should be an absolute path");
     
     // Existing file which is not a directory 
-    conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY, "/dev/null");
-    assertJNFailsToStart(conf, "is not a directory");
+    File existingFile = new File(TEST_BUILD_DATA, "testjournalnodefile");
+    assertTrue(existingFile.createNewFile());
+    try {
+      conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY,
+          existingFile.getAbsolutePath());
+      assertJNFailsToStart(conf, "Not a directory");
+    } finally {
+      existingFile.delete();
+    }
     
     // Directory which cannot be created
-    conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY, "/proc/does-not-exist");
-    assertJNFailsToStart(conf, "Could not create");
-
+    conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY,
+        Shell.WINDOWS ? "\\\\cannotBeCreated" : "/proc/does-not-exist");
+    assertJNFailsToStart(conf, "Can not create directory");
   }
 
   private static void assertJNFailsToStart(Configuration conf,

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java

@@ -104,7 +104,7 @@ public class TestNodeCount {
       while (iter.hasNext()) {
         DatanodeDescriptor dn = iter.next();
         Collection<Block> blocks = bm.excessReplicateMap.get(dn.getStorageID());
-        if (blocks == null || !blocks.contains(block) ) {
+        if (blocks == null || !blocks.contains(block.getLocalBlock()) ) {
           nonExcessDN = dn;
           break;
         }

+ 2 - 24
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java

@@ -36,6 +36,7 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.LogVerificationAppender;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
@@ -45,7 +46,6 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.net.NetworkTopology;
 import org.apache.hadoop.net.Node;
 import org.apache.hadoop.util.Time;
-import org.apache.log4j.AppenderSkeleton;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.log4j.spi.LoggingEvent;
@@ -419,7 +419,7 @@ public class TestReplicationPolicy {
           (HdfsConstants.MIN_BLOCKS_FOR_WRITE-1)*BLOCK_SIZE, 0L, 0, 0);
     }
     
-    final TestAppender appender = new TestAppender();
+    final LogVerificationAppender appender = new LogVerificationAppender();
     final Logger logger = Logger.getRootLogger();
     logger.addAppender(appender);
     
@@ -446,28 +446,6 @@ public class TestReplicationPolicy {
           HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L, 0, 0);
     }
   }
-  
-  class TestAppender extends AppenderSkeleton {
-    private final List<LoggingEvent> log = new ArrayList<LoggingEvent>();
-
-    @Override
-    public boolean requiresLayout() {
-      return false;
-    }
-
-    @Override
-    protected void append(final LoggingEvent loggingEvent) {
-      log.add(loggingEvent);
-    }
-
-    @Override
-    public void close() {
-    }
-
-    public List<LoggingEvent> getLog() {
-      return new ArrayList<LoggingEvent>(log);
-    }
-  }
 
   private boolean containsWithinRange(DatanodeDescriptor target,
       DatanodeDescriptor[] nodes, int startIndex, int endIndex) {

+ 12 - 19
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataDirs.java

@@ -27,33 +27,26 @@ import java.util.List;
 import org.junit.Test;
 import static org.junit.Assert.*;
 import static org.mockito.Mockito.*;
-import static org.apache.hadoop.test.MockitoMaker.*;
 
-import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.server.datanode.DataNode.DataNodeDiskChecker;
 
 public class TestDataDirs {
 
-  @Test public void testGetDataDirsFromURIs() throws Throwable {
-    File localDir = make(stub(File.class).returning(true).from.exists());
-    when(localDir.mkdir()).thenReturn(true);
-    FsPermission normalPerm = new FsPermission("700");
-    FsPermission badPerm = new FsPermission("000");
-    FileStatus stat = make(stub(FileStatus.class)
-        .returning(normalPerm, normalPerm, badPerm).from.getPermission());
-    when(stat.isDirectory()).thenReturn(true);
-    LocalFileSystem fs = make(stub(LocalFileSystem.class)
-        .returning(stat).from.getFileStatus(any(Path.class)));
-    when(fs.pathToFile(any(Path.class))).thenReturn(localDir);
+  @Test (timeout = 10000)
+  public void testGetDataDirsFromURIs() throws Throwable {
+    
+    DataNodeDiskChecker diskChecker = mock(DataNodeDiskChecker.class);
+    doThrow(new IOException()).doThrow(new IOException()).doNothing()
+      .when(diskChecker).checkDir(any(LocalFileSystem.class), any(Path.class));
+    LocalFileSystem fs = mock(LocalFileSystem.class);
     Collection<URI> uris = Arrays.asList(new URI("file:/p1/"),
         new URI("file:/p2/"), new URI("file:/p3/"));
 
-    List<File> dirs = DataNode.getDataDirsFromURIs(uris, fs, normalPerm);
-
-    verify(fs, times(2)).setPermission(any(Path.class), eq(normalPerm));
-    verify(fs, times(6)).getFileStatus(any(Path.class));
-    assertEquals("number of valid data dirs", dirs.size(), 1);
+    List<File> dirs = DataNode.getDataDirsFromURIs(uris, fs, diskChecker);
+    assertEquals("number of valid data dirs", 1, dirs.size());
+    String validDir = dirs.iterator().next().getPath();
+    assertEquals("p3 should be valid", new File("/p3").getPath(), validDir);
   }
 }

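The rewritten TestDataDirs relies on Mockito's consecutive stubbing: two checkDir() calls throw, the third succeeds, so exactly one of the three URIs survives. A minimal standalone sketch of that stubbing style follows; the DiskChecker interface here is a hypothetical stand-in for DataNode.DataNodeDiskChecker, which lives inside DataNode itself.

import static org.mockito.Mockito.*;

import java.io.IOException;

import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;

public class DiskCheckerStubSketch {
  // Hypothetical stand-in for DataNode.DataNodeDiskChecker.
  interface DiskChecker {
    void checkDir(LocalFileSystem fs, Path dir) throws IOException;
  }

  public static void main(String[] args) {
    DiskChecker checker = mock(DiskChecker.class);
    // Consecutive stubbing: fail twice, then accept every later call.
    try {
      doThrow(new IOException("bad dir 1"))
          .doThrow(new IOException("bad dir 2"))
          .doNothing()
          .when(checker).checkDir(any(LocalFileSystem.class), any(Path.class));
    } catch (IOException impossible) {
      // stubbing a mock never actually throws
    }

    LocalFileSystem fs = mock(LocalFileSystem.class);
    for (String p : new String[] {"/p1", "/p2", "/p3"}) {
      try {
        checker.checkDir(fs, new Path(p));
        System.out.println(p + " accepted");   // only /p3 reaches here
      } catch (IOException e) {
        System.out.println(p + " rejected: " + e.getMessage());
      }
    }
  }
}
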
+ 7 - 7
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java

@@ -274,15 +274,15 @@ public abstract class FSImageTestUtil {
     for (File dir : dirs) {
       FSImageTransactionalStorageInspector inspector =
         inspectStorageDirectory(dir, NameNodeDirType.IMAGE);
-      FSImageFile latestImage = inspector.getLatestImage();
-      assertNotNull("No image in " + dir, latestImage);      
-      long thisTxId = latestImage.getCheckpointTxId();
+      List<FSImageFile> latestImages = inspector.getLatestImages();
+      assert(!latestImages.isEmpty());
+      long thisTxId = latestImages.get(0).getCheckpointTxId();
       if (imageTxId != -1 && thisTxId != imageTxId) {
         fail("Storage directory " + dir + " does not have the same " +
             "last image index " + imageTxId + " as another");
       }
       imageTxId = thisTxId;
-      imageFiles.add(inspector.getLatestImage().getFile());
+      imageFiles.add(inspector.getLatestImages().get(0).getFile());
     }
     
     assertFileContentsSame(imageFiles.toArray(new File[0]));
@@ -426,7 +426,7 @@ public abstract class FSImageTestUtil {
       new FSImageTransactionalStorageInspector();
     inspector.inspectDirectory(sd);
     
-    return inspector.getLatestImage().getFile();
+    return inspector.getLatestImages().get(0).getFile();
   }
 
   /**
@@ -441,8 +441,8 @@ public abstract class FSImageTestUtil {
       new FSImageTransactionalStorageInspector();
     inspector.inspectDirectory(sd);
 
-    FSImageFile latestImage = inspector.getLatestImage();
-    return (latestImage == null) ? null : latestImage.getFile();
+    List<FSImageFile> latestImages = inspector.getLatestImages();
+    return (latestImages.isEmpty()) ? null : latestImages.get(0).getFile();
   }
 
   /**

+ 55 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java

@@ -231,7 +231,7 @@ public class TestCheckpoint {
   /*
    * Simulate exception during edit replay.
    */
-  @Test(timeout=5000)
+  @Test(timeout=30000)
   public void testReloadOnEditReplayFailure () throws IOException {
     Configuration conf = new HdfsConfiguration();
     FSDataOutputStream fos = null;
@@ -1411,6 +1411,60 @@ public class TestCheckpoint {
     }
   }
   
+  /**
+   * Test NN restart if a failure happens in between creating the fsimage
+   * MD5 file and renaming the fsimage.
+   */
+  @Test(timeout=30000)
+  public void testFailureBeforeRename () throws IOException {
+    Configuration conf = new HdfsConfiguration();
+    FSDataOutputStream fos = null;
+    SecondaryNameNode secondary = null;
+    MiniDFSCluster cluster = null;
+    FileSystem fs = null;
+    NameNode namenode = null;
+
+    try {
+      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
+          .build();
+      cluster.waitActive();
+      namenode = cluster.getNameNode();
+      fs = cluster.getFileSystem();
+      secondary = startSecondaryNameNode(conf);
+      fos = fs.create(new Path("tmpfile0"));
+      fos.write(new byte[] { 0, 1, 2, 3 });
+      secondary.doCheckpoint();
+      fos.write(new byte[] { 0, 1, 2, 3 });
+      fos.hsync();
+
+      // Cause merge to fail in next checkpoint.
+      Mockito.doThrow(new IOException(
+          "Injecting failure after MD5Rename"))
+          .when(faultInjector).afterMD5Rename();
+
+      try {
+        secondary.doCheckpoint();
+        fail("Fault injection failed.");
+      } catch (IOException ioe) {
+        // This is expected.
+      }
+      Mockito.reset(faultInjector);
+      // Namenode should still restart successfully
+      cluster.restartNameNode();
+    } finally {
+      if (secondary != null) {
+        secondary.shutdown();
+      }
+      if (fs != null) {
+        fs.close();
+      }
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+      Mockito.reset(faultInjector);
+    }
+  }
+
   /**
    * Test case where two secondary namenodes are checkpointing the same
    * NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java

@@ -57,7 +57,7 @@ public class TestFSImageStorageInspector {
     inspector.inspectDirectory(mockDir);
     assertEquals(2, inspector.foundImages.size());
 
-    FSImageFile latestImage = inspector.getLatestImage();
+    FSImageFile latestImage = inspector.getLatestImages().get(0);
     assertEquals(456, latestImage.txId);
     assertSame(mockDir, latestImage.sd);
     assertTrue(inspector.isUpgradeFinalized());

+ 4 - 3
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestHostsFiles.java

@@ -120,12 +120,13 @@ public class TestHostsFiles {
       
       InetSocketAddress nnHttpAddress = cluster.getNameNode().getHttpAddress();
       LOG.info("nnaddr = '" + nnHttpAddress + "'");
-      URL nnjsp = new URL("http://" + nnHttpAddress.getHostName() + ":" + nnHttpAddress.getPort() + "/dfshealth.jsp");
+      String nnHostName = nnHttpAddress.getHostName();
+      URL nnjsp = new URL("http://" + nnHostName + ":" + nnHttpAddress.getPort() + "/dfshealth.jsp");
       LOG.info("fetching " + nnjsp);
       String dfshealthPage = StringEscapeUtils.unescapeHtml(DFSTestUtil.urlGet(nnjsp));
       LOG.info("got " + dfshealthPage);
-      assertTrue("dfshealth should contain localhost, got:" + dfshealthPage,
-          dfshealthPage.contains("localhost"));
+      assertTrue("dfshealth should contain " + nnHostName + ", got:" + dfshealthPage,
+          dfshealthPage.contains(nnHostName));
 
     } finally {
       cluster.shutdown();

+ 9 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestProcessCorruptBlocks.java

@@ -158,7 +158,7 @@ public class TestProcessCorruptBlocks {
    *     (corrupt replica should  be removed since number of good
    *      replicas (1) is equal to replication factor (1))
    */
-  @Test
+  @Test(timeout=20000)
   public void testWithReplicationFactorAsOne() throws Exception {
     Configuration conf = new HdfsConfiguration();
     conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
@@ -183,9 +183,14 @@ public class TestProcessCorruptBlocks {
       namesystem.setReplication(fileName.toString(), (short) 1);
 
       // wait for 3 seconds so that all block reports are processed.
-      try {
-        Thread.sleep(3000);
-      } catch (InterruptedException ignored) {
+      for (int i = 0; i < 10; i++) {
+        try {
+          Thread.sleep(1000);
+        } catch (InterruptedException ignored) {
+        }
+        if (countReplicas(namesystem, block).corruptReplicas() == 0) {
+          break;
+        }
       }
 
       assertEquals(1, countReplicas(namesystem, block).liveReplicas());

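The fix above replaces a single fixed sleep with a bounded poll that exits as soon as the corrupt replica count drops to zero. A generic form of that pattern is sketched below; the Callable passed in is a placeholder for a check such as countReplicas(namesystem, block).corruptReplicas() == 0.

import java.util.concurrent.Callable;

public class BoundedPollSketch {
  static boolean waitFor(Callable<Boolean> condition, long intervalMs,
      int maxTries) throws Exception {
    for (int i = 0; i < maxTries; i++) {
      if (condition.call()) {
        return true;                  // condition met, stop waiting early
      }
      try {
        Thread.sleep(intervalMs);     // otherwise back off and retry
      } catch (InterruptedException ignored) {
      }
    }
    return false;                     // caller's assertions report the failure
  }

  public static void main(String[] args) throws Exception {
    final long deadline = System.currentTimeMillis() + 1500;
    boolean ok = waitFor(new Callable<Boolean>() {
      @Override
      public Boolean call() {
        return System.currentTimeMillis() > deadline;  // toy condition
      }
    }, 1000, 10);
    System.out.println("condition met: " + ok);
  }
}
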
+ 21 - 21
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java

@@ -41,6 +41,7 @@ import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.fs.permission.PermissionStatus;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -219,7 +220,7 @@ public class TestSaveNamespace {
    * Verify that a saveNamespace command brings faulty directories
    * in fs.name.dir and fs.edit.dir back online.
    */
-  @Test
+  @Test (timeout=30000)
   public void testReinsertnamedirsInSavenamespace() throws Exception {
     // create a configuration with the key to restore error
     // directories in fs.name.dir
@@ -237,10 +238,13 @@ public class TestSaveNamespace {
     FSImage spyImage = spy(originalImage);
     fsn.dir.fsImage = spyImage;
     
+    FileSystem fs = FileSystem.getLocal(conf);
     File rootDir = storage.getStorageDir(0).getRoot();
-    rootDir.setExecutable(false);
-    rootDir.setWritable(false);
-    rootDir.setReadable(false);
+    Path rootPath = new Path(rootDir.getPath(), "current");
+    final FsPermission permissionNone = new FsPermission((short) 0);
+    final FsPermission permissionAll = new FsPermission(
+        FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE);
+    fs.setPermission(rootPath, permissionNone);
 
     try {
       doAnEdit(fsn, 1);
@@ -257,9 +261,7 @@ public class TestSaveNamespace {
                  " bad directories.", 
                    storage.getRemovedStorageDirs().size() == 1);
 
-      rootDir.setExecutable(true);
-      rootDir.setWritable(true);
-      rootDir.setReadable(true);
+      fs.setPermission(rootPath, permissionAll);
 
       // The next call to savenamespace should try inserting the
       // erroneous directory back to fs.name.dir. This command should
@@ -290,9 +292,7 @@ public class TestSaveNamespace {
       LOG.info("Reloaded image is good.");
     } finally {
       if (rootDir.exists()) {
-        rootDir.setExecutable(true);
-        rootDir.setWritable(true);
-        rootDir.setReadable(true);
+        fs.setPermission(rootPath, permissionAll);
       }
 
       if (fsn != null) {
@@ -305,27 +305,27 @@ public class TestSaveNamespace {
     }
   }
 
-  @Test
+  @Test (timeout=30000)
   public void testRTEWhileSavingSecondImage() throws Exception {
     saveNamespaceWithInjectedFault(Fault.SAVE_SECOND_FSIMAGE_RTE);
   }
 
-  @Test
+  @Test (timeout=30000)
   public void testIOEWhileSavingSecondImage() throws Exception {
     saveNamespaceWithInjectedFault(Fault.SAVE_SECOND_FSIMAGE_IOE);
   }
 
-  @Test
+  @Test (timeout=30000)
   public void testCrashInAllImageDirs() throws Exception {
     saveNamespaceWithInjectedFault(Fault.SAVE_ALL_FSIMAGES);
   }
   
-  @Test
+  @Test (timeout=30000)
   public void testCrashWhenWritingVersionFiles() throws Exception {
     saveNamespaceWithInjectedFault(Fault.WRITE_STORAGE_ALL);
   }
   
-  @Test
+  @Test (timeout=30000)
   public void testCrashWhenWritingVersionFileInOneDir() throws Exception {
     saveNamespaceWithInjectedFault(Fault.WRITE_STORAGE_ONE);
   }
@@ -337,7 +337,7 @@ public class TestSaveNamespace {
    * failed checkpoint since it only affected ".ckpt" files, not
    * valid image files
    */
-  @Test
+  @Test (timeout=30000)
   public void testFailedSaveNamespace() throws Exception {
     doTestFailedSaveNamespace(false);
   }
@@ -347,7 +347,7 @@ public class TestSaveNamespace {
    * the operator restores the directories and calls it again.
    * This should leave the NN in a clean state for next start.
    */
-  @Test
+  @Test (timeout=30000)
   public void testFailedSaveNamespaceWithRecovery() throws Exception {
     doTestFailedSaveNamespace(true);
   }
@@ -421,7 +421,7 @@ public class TestSaveNamespace {
     }
   }
 
-  @Test
+  @Test (timeout=30000)
   public void testSaveWhileEditsRolled() throws Exception {
     Configuration conf = getConf();
     NameNode.initMetrics(conf, NamenodeRole.NAMENODE);
@@ -457,7 +457,7 @@ public class TestSaveNamespace {
     }
   }
   
-  @Test
+  @Test (timeout=30000)
   public void testTxIdPersistence() throws Exception {
     Configuration conf = getConf();
     NameNode.initMetrics(conf, NamenodeRole.NAMENODE);
@@ -580,7 +580,7 @@ public class TestSaveNamespace {
    * open lease and destination directory exist. 
    * This test is a regression for HDFS-2827
    */
-  @Test
+  @Test (timeout=30000)
   public void testSaveNamespaceWithRenamedLease() throws Exception {
     MiniDFSCluster cluster = new MiniDFSCluster.Builder(new Configuration())
         .numDataNodes(1).build();
@@ -603,7 +603,7 @@ public class TestSaveNamespace {
     }
   }
   
-  @Test
+  @Test (timeout=30000)
   public void testSaveNamespaceWithDanglingLease() throws Exception {
     MiniDFSCluster cluster = new MiniDFSCluster.Builder(new Configuration())
         .numDataNodes(1).build();

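The TestSaveNamespace change swaps java.io.File permission flips for FileSystem.setPermission with FsPermission objects, which also behaves sensibly on Windows. A small sketch of that toggle, using a placeholder local directory, is below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

public class PermissionToggleSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path dir = new Path("build/test/data/permdir");   // placeholder path
    fs.mkdirs(dir);

    FsPermission none = new FsPermission((short) 0);
    FsPermission all = new FsPermission(
        FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE);

    fs.setPermission(dir, none);   // simulate a failed storage directory
    fs.setPermission(dir, all);    // bring it back online
    fs.delete(dir, true);
  }
}
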
+ 83 - 32
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java

@@ -31,12 +31,10 @@ import java.net.URI;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Random;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
@@ -46,17 +44,21 @@ import org.apache.hadoop.fs.permission.PermissionStatus;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.LogVerificationAppender;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
+import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
 import org.apache.hadoop.hdfs.util.MD5FileUtils;
 import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.log4j.Logger;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -111,11 +113,12 @@ public class TestStartup {
     }	
   }
 
-   /**
-   * start MiniDFScluster, create a file (to create edits) and do a checkpoint  
+  /**
+   * Create a number of fsimage checkpoints
+   * @param count number of checkpoints to create
    * @throws IOException
    */
-  public void createCheckPoint() throws IOException {
+  public void createCheckPoint(int count) throws IOException {
     LOG.info("--starting mini cluster");
     // manage dirs parameter set to false 
     MiniDFSCluster cluster = null;
@@ -133,15 +136,18 @@ public class TestStartup {
       sn = new SecondaryNameNode(config);
       assertNotNull(sn);
 
-      // create a file
-      FileSystem fileSys = cluster.getFileSystem();
-      Path file1 = new Path("t1");
-      DFSTestUtil.createFile(fileSys, file1, fileSize, fileSize, blockSize, 
-          (short) 1, seed);
-
-      LOG.info("--doing checkpoint");
-      sn.doCheckpoint();  // this shouldn't fail
-      LOG.info("--done checkpoint");
+      // Create count new files and checkpoints
+      for (int i=0; i<count; i++) {
+        // create a file
+        FileSystem fileSys = cluster.getFileSystem();
+        Path p = new Path("t" + i);
+        DFSTestUtil.createFile(fileSys, p, fileSize, fileSize,
+            blockSize, (short) 1, seed);
+        LOG.info("--file " + p.toString() + " created");
+        LOG.info("--doing checkpoint");
+        sn.doCheckpoint();  // this shouldn't fail
+        LOG.info("--done checkpoint");
+      }
     } catch (IOException e) {
       fail(StringUtils.stringifyException(e));
       System.err.println("checkpoint failed");
@@ -151,7 +157,36 @@ public class TestStartup {
         sn.shutdown();
       if(cluster!=null) 
         cluster.shutdown();
-      LOG.info("--file t1 created, cluster shutdown");
+      LOG.info("--cluster shutdown");
+    }
+  }
+
+  /**
+   * Corrupts the MD5 sum of the fsimage.
+   * 
+   * @param corruptAll
+   *          whether to corrupt one or all of the MD5 sums in the configured
+   *          namedirs
+   * @throws IOException
+   */
+  private void corruptFSImageMD5(boolean corruptAll) throws IOException {
+    List<URI> nameDirs = (List<URI>)FSNamesystem.getNamespaceDirs(config);
+    // Corrupt the md5 files in all the namedirs
+    for (URI uri: nameDirs) {
+      // Directory layout looks like:
+      // test/data/dfs/nameN/current/{fsimage,edits,...}
+      File nameDir = new File(uri.getPath());
+      File dfsDir = nameDir.getParentFile();
+      assertEquals(dfsDir.getName(), "dfs"); // make sure we got right dir
+      // Set the md5 file to all zeros
+      File imageFile = new File(nameDir,
+          Storage.STORAGE_DIR_CURRENT + "/"
+          + NNStorage.getImageFileName(0));
+      MD5FileUtils.saveMD5File(imageFile, new MD5Hash(new byte[16]));
+      // Only need to corrupt one if !corruptAll
+      if (!corruptAll) {
+        break;
+      }
     }
   }
 
@@ -165,7 +200,7 @@ public class TestStartup {
 
     // get name dir and its length, then delete and recreate the directory
     File dir = new File(nameDirs.get(0).getPath()); // has only one
-    this.fsimageLength = new File(new File(dir, "current"), 
+    this.fsimageLength = new File(new File(dir, Storage.STORAGE_DIR_CURRENT), 
         NameNodeFile.IMAGE.getName()).length();
 
     if(dir.exists() && !(FileUtil.fullyDelete(dir)))
@@ -178,7 +213,7 @@ public class TestStartup {
 
     dir = new File( nameEditsDirs.get(0).getPath()); //has only one
 
-    this.editsLength = new File(new File(dir, "current"), 
+    this.editsLength = new File(new File(dir, Storage.STORAGE_DIR_CURRENT), 
         NameNodeFile.EDITS.getName()).length();
 
     if(dir.exists() && !(FileUtil.fullyDelete(dir)))
@@ -262,7 +297,7 @@ public class TestStartup {
     config.set(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY,
         fileAsURI(new File(hdfsDir, "chkpt")).toString());
 
-    createCheckPoint();
+    createCheckPoint(1);
 
     corruptNameNodeFiles();
     checkNameNodeFiles();
@@ -289,7 +324,7 @@ public class TestStartup {
     config.set(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY,
         fileAsURI(new File(hdfsDir, "chkpt")).toString());
 
-    createCheckPoint();
+    createCheckPoint(1);
     corruptNameNodeFiles();
     checkNameNodeFiles();
   }
@@ -447,20 +482,18 @@ public class TestStartup {
         FileSystem fs = cluster.getFileSystem();
         fs.mkdirs(new Path("/test"));
         
-        // Directory layout looks like:
-        // test/data/dfs/nameN/current/{fsimage,edits,...}
-        File nameDir = new File(cluster.getNameDirs(0).iterator().next().getPath());
-        File dfsDir = nameDir.getParentFile();
-        assertEquals(dfsDir.getName(), "dfs"); // make sure we got right dir
-        
         LOG.info("Shutting down cluster #1");
         cluster.shutdown();
         cluster = null;
 
-        // Corrupt the md5 file to all 0s
-        File imageFile = new File(nameDir, "current/" + NNStorage.getImageFileName(0));
-        MD5FileUtils.saveMD5File(imageFile, new MD5Hash(new byte[16]));
-        
+        // Corrupt the md5 files in all the namedirs
+        corruptFSImageMD5(true);
+
+        // Attach our own log appender so we can verify output
+        final LogVerificationAppender appender = new LogVerificationAppender();
+        final Logger logger = Logger.getRootLogger();
+        logger.addAppender(appender);
+
         // Try to start a new cluster
         LOG.info("\n===========================================\n" +
         "Starting same cluster after simulated crash");
@@ -471,9 +504,12 @@ public class TestStartup {
             .build();
           fail("Should not have successfully started with corrupt image");
         } catch (IOException ioe) {
-          if (!ioe.getCause().getMessage().contains("is corrupt with MD5")) {
-            throw ioe;
-          }
+          GenericTestUtils.assertExceptionContains(
+              "Failed to load an FSImage file!", ioe);
+          int md5failures = appender.countExceptionsWithMessage(
+              " is corrupt with MD5 checksum of ");
+          // Two namedirs, so should have seen two failures
+          assertEquals(2, md5failures);
         }
     } finally {
       if (cluster != null) {
@@ -482,6 +518,21 @@ public class TestStartup {
     }
   }
   
+  @Test(timeout=30000)
+  public void testCorruptImageFallback() throws IOException {
+    // Create two checkpoints
+    createCheckPoint(2);
+    // Delete a single md5sum
+    corruptFSImageMD5(false);
+    // Should still be able to start
+    MiniDFSCluster cluster = new MiniDFSCluster.Builder(config)
+        .format(false)
+        .manageDataDfsDirs(false)
+        .manageNameDfsDirs(false)
+        .build();
+    cluster.waitActive();
+  }
+
   /**
    * This test tests hosts include list contains host names.  After namenode
    * restarts, the still alive datanodes should not have any trouble in getting

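The startup test now verifies the failure mode through log output rather than by unwrapping the thrown exception. A minimal sketch of the LogVerificationAppender pattern follows, assuming (as the assertion above implies) that countExceptionsWithMessage() counts logged throwables whose text contains the given fragment; the logged message is a placeholder.

import java.io.IOException;

import org.apache.hadoop.hdfs.LogVerificationAppender;
import org.apache.log4j.Logger;

public class LogCountSketch {
  public static void main(String[] args) {
    final LogVerificationAppender appender = new LogVerificationAppender();
    final Logger logger = Logger.getRootLogger();
    logger.addAppender(appender);
    try {
      // Placeholder error carrying the fragment the appender will look for.
      logger.error("loading image",
          new IOException("placeholder is corrupt with MD5 checksum of 0000"));
      int hits = appender.countExceptionsWithMessage(
          " is corrupt with MD5 checksum of ");
      System.out.println("matching exceptions logged: " + hits);  // expected 1
    } finally {
      logger.removeAppender(appender);
    }
  }
}
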
+ 51 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

@@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.namenode.ha;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.io.File;
 import java.io.IOException;
@@ -26,6 +27,8 @@ import java.io.OutputStream;
 import java.net.URI;
 import java.util.List;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -43,7 +46,10 @@ import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.apache.hadoop.io.compress.CompressionOutputStream;
 import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
+import org.apache.hadoop.util.ThreadUtil;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -59,6 +65,8 @@ public class TestStandbyCheckpoints {
   protected MiniDFSCluster cluster;
   protected NameNode nn0, nn1;
   protected FileSystem fs;
+  
+  private static final Log LOG = LogFactory.getLog(TestStandbyCheckpoints.class);
 
   @SuppressWarnings("rawtypes")
   @Before
@@ -231,6 +239,49 @@ public class TestStandbyCheckpoints {
     
     assertTrue(canceledOne);
   }
+  
+  /**
+   * Make sure that clients will receive StandbyExceptions even when a
+   * checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer
+   * thread will hold the FSNS lock. Regression test for HDFS-4591.
+   */
+  @Test(timeout=120000)
+  public void testStandbyExceptionThrownDuringCheckpoint() throws Exception {
+    
+    // Set it up so that we know when the SBN checkpoint starts and ends.
+    FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
+    DelayAnswer answerer = new DelayAnswer(LOG);
+    Mockito.doAnswer(answerer).when(spyImage1)
+        .saveNamespace(Mockito.any(FSNamesystem.class),
+            Mockito.any(Canceler.class));
+    
+    // Perform some edits and wait for a checkpoint to start on the SBN.
+    doEdits(0, 2000);
+    nn0.getRpcServer().rollEditLog();
+    answerer.waitForCall();
+    answerer.proceed();
+    assertTrue("SBN is not performing checkpoint but it should be.",
+        answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
+    
+    // Make sure that the lock has actually been taken by the checkpointing
+    // thread.
+    ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
+    try {
+      // Perform an RPC to the SBN and make sure it throws a StandbyException.
+      nn1.getRpcServer().getFileInfo("/");
+      fail("Should have thrown StandbyException, but instead succeeded.");
+    } catch (StandbyException se) {
+      GenericTestUtils.assertExceptionContains("is not supported", se);
+    }
+    
+    // Make sure that the checkpoint is still going on, implying that the client
+    // RPC to the SBN happened during the checkpoint.
+    assertTrue("SBN should have still been checkpointing.",
+        answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
+    answerer.waitForResult();
+    assertTrue("SBN should have finished checkpointing.",
+        answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
+  }
 
   private void doEdits(int start, int stop) throws IOException {
     for (int i = start; i < stop; i++) {

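The HDFS-4591 regression test coordinates with the checkpointing thread through GenericTestUtils.DelayAnswer: the spied saveNamespace() blocks until the test calls proceed(), and waitForResult() reports when it returns. The sketch below drives the same answer object against a trivial spied class; SlowService is hypothetical and stands in for the spied FSImage.

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
import org.mockito.Mockito;

public class DelayAnswerSketch {
  static final Log LOG = LogFactory.getLog(DelayAnswerSketch.class);

  // Hypothetical stand-in for the spied FSImage.
  public static class SlowService {
    public void saveThings() { }
  }

  public static void main(String[] args) throws Exception {
    final SlowService spy = Mockito.spy(new SlowService());
    final DelayAnswer answerer = new DelayAnswer(LOG);
    Mockito.doAnswer(answerer).when(spy).saveThings();

    Thread t = new Thread() {
      @Override
      public void run() {
        spy.saveThings();                  // blocks inside DelayAnswer
      }
    };
    t.start();

    answerer.waitForCall();                // the call has started...
    answerer.proceed();                    // ...let it continue
    answerer.waitForResult();              // ...and wait until it returns
    t.join();
    System.out.println("fired=" + answerer.getFireCount()
        + " completed=" + answerer.getResultCount());
  }
}
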
+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java

@@ -143,6 +143,7 @@ public class TestStandbyIsHot {
     conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
     // We read from the standby to watch block locations
     HAUtil.setAllowStandbyReads(conf, true);
+    conf.setLong(DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 0);
     conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
     MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
       .nnTopology(MiniDFSNNTopology.simpleHATopology())

+ 11 - 0
hadoop-mapreduce-project/CHANGES.txt

@@ -230,6 +230,11 @@ Release 2.0.5-beta - UNRELEASED
     appropriately used and that on-disk segments are correctly sorted on
     file-size. (Anty Rao and Ravi Prakash via acmurthy) 
 
+    MAPREDUCE-4571. TestHsWebServicesJobs fails on jdk7. (tgraves via tucu)
+
+    MAPREDUCE-4716. TestHsWebServicesJobsQuery.testJobsQueryStateInvalid 
+    fails with jdk7. (tgraves via tucu)
+
 Release 2.0.4-alpha - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -807,6 +812,12 @@ Release 0.23.7 - UNRELEASED
     MAPREDUCE-5023. History Server Web Services missing Job Counters (Ravi
     Prakash via tgraves)
 
+    MAPREDUCE-5060. Fetch failures that time out only count against the first
+    map task (Robert Joseph Evans via jlowe)
+
+    MAPREDUCE-5042. Reducer unable to fetch for a map task that was recovered
+    (Jason Lowe via bobby)
+
 Release 0.23.6 - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 9 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/YarnChild.java

@@ -269,9 +269,17 @@ class YarnChild {
     job.setBoolean("ipc.client.tcpnodelay", true);
     job.setClass(MRConfig.TASK_LOCAL_OUTPUT_CLASS,
         YarnOutputFiles.class, MapOutputFile.class);
-    // set the jobTokenFile into task
+    // set the jobToken and shuffle secrets into task
     task.setJobTokenSecret(
         JobTokenSecretManager.createSecretKey(jt.getPassword()));
+    byte[] shuffleSecret = TokenCache.getShuffleSecretKey(credentials);
+    if (shuffleSecret == null) {
+      LOG.warn("Shuffle secret missing from task credentials."
+          + " Using job token secret as shuffle secret.");
+      shuffleSecret = jt.getPassword();
+    }
+    task.setShuffleSecret(
+        JobTokenSecretManager.createSecretKey(shuffleSecret));
 
     // setup the child's MRConfig.LOCAL_DIR.
     configureLocalDirs(task, job);

+ 16 - 3
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java

@@ -55,6 +55,7 @@ import org.apache.hadoop.mapreduce.jobhistory.JobHistoryCopyService;
 import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
 import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEventHandler;
 import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
+import org.apache.hadoop.mapreduce.security.TokenCache;
 import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
 import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
@@ -339,8 +340,15 @@ public class MRAppMaster extends CompositeService {
       boolean recoveryEnabled = conf.getBoolean(
           MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
       boolean recoverySupportedByCommitter = committer.isRecoverySupported();
+
+      // If a shuffle secret was not provided by the job client then this app
+      // attempt will generate one. However, that disables recovery if there
+      // are reducers, as the shuffle secret would be app-attempt specific.
+      boolean shuffleKeyValidForRecovery = (numReduceTasks > 0 &&
+          TokenCache.getShuffleSecretKey(fsTokens) != null);
+
       if (recoveryEnabled && recoverySupportedByCommitter
-          && appAttemptID.getAttemptId() > 1) {
+          && shuffleKeyValidForRecovery && appAttemptID.getAttemptId() > 1) {
         LOG.info("Recovery is enabled. "
             + "Will try to recover from previous life on best effort basis.");
         recoveryServ = createRecoveryService(context);
@@ -351,7 +359,8 @@ public class MRAppMaster extends CompositeService {
       } else {
         LOG.info("Not starting RecoveryService: recoveryEnabled: "
             + recoveryEnabled + " recoverySupportedByCommitter: "
-            + recoverySupportedByCommitter + " ApplicationAttemptID: "
+            + recoverySupportedByCommitter + " shuffleKeyValidForRecovery: "
+            + shuffleKeyValidForRecovery + " ApplicationAttemptID: "
             + appAttemptID.getAttemptId());
         dispatcher = createDispatcher();
         addIfService(dispatcher);
@@ -471,7 +480,11 @@ public class MRAppMaster extends CompositeService {
   protected FileSystem getFileSystem(Configuration conf) throws IOException {
     return FileSystem.get(conf);
   }
-  
+
+  protected Credentials getCredentials() {
+    return fsTokens;
+  }
+
   /**
    * clean up staging directories for the job.
    * @throws IOException

+ 7 - 7
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/JobImpl.java

@@ -1350,13 +1350,13 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
       LOG.info("Adding job token for " + oldJobIDString
           + " to jobTokenSecretManager");
 
-      // Upload the jobTokens onto the remote FS so that ContainerManager can
-      // localize it to be used by the Containers(tasks)
-      Credentials tokenStorage = new Credentials();
-      TokenCache.setJobToken(job.jobToken, tokenStorage);
-
-      if (UserGroupInformation.isSecurityEnabled()) {
-        tokenStorage.addAll(job.fsTokens);
+      // If the job client did not setup the shuffle secret then reuse
+      // the job token secret for the shuffle.
+      if (TokenCache.getShuffleSecretKey(job.fsTokens) == null) {
+        LOG.warn("Shuffle secret key missing from job credentials."
+            + " Using job token secret as shuffle secret.");
+        TokenCache.setShuffleSecretKey(job.jobToken.getPassword(),
+            job.fsTokens);
       }
     }
 

+ 13 - 2
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java

@@ -702,10 +702,21 @@ public abstract class TaskAttemptImpl implements
           ByteBuffer.wrap(containerTokens_dob.getData(), 0,
               containerTokens_dob.getLength());
 
-      // Add shuffle token
+      // Add shuffle secret key
+      // The secret key is converted to a JobToken to preserve backwards
+      // compatibility with an older ShuffleHandler running on an NM.
       LOG.info("Putting shuffle token in serviceData");
+      byte[] shuffleSecret = TokenCache.getShuffleSecretKey(credentials);
+      if (shuffleSecret == null) {
+        LOG.warn("Cannot locate shuffle secret in credentials."
+            + " Using job token as shuffle secret.");
+        shuffleSecret = jobToken.getPassword();
+      }
+      Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(
+          jobToken.getIdentifier(), shuffleSecret, jobToken.getKind(),
+          jobToken.getService());
       serviceData.put(ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID,
-          ShuffleHandler.serializeServiceData(jobToken));
+          ShuffleHandler.serializeServiceData(shuffleToken));
 
       Apps.addToEnvironment(
           environment,  

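TaskAttemptImpl keeps wire compatibility by packaging the shuffle secret as a Token that reuses the job token's identifier, kind and service, substituting the secret as the password. A standalone sketch of that wrapping is below; every identifier, secret and Text value in it is a placeholder.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier;
import org.apache.hadoop.security.token.Token;

public class ShuffleTokenWrapSketch {
  public static void main(String[] args) {
    // Placeholder job token; a real one is handed to the AM at submission.
    Token<JobTokenIdentifier> jobToken = new Token<JobTokenIdentifier>(
        "job_0000000000000_0001".getBytes(), "job-token-secret".getBytes(),
        new Text("placeholder-kind"), new Text("placeholder-service"));

    // Placeholder secret; normally TokenCache.getShuffleSecretKey(credentials).
    byte[] shuffleSecret = "shuffle-secret".getBytes();

    // Same identifier, kind and service, but the shuffle secret as password.
    Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(
        jobToken.getIdentifier(), shuffleSecret, jobToken.getKind(),
        jobToken.getService());
    System.out.println("wrapped token kind: " + shuffleToken.getKind());
  }
}
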
+ 1 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/webapp/CountersBlock.java

@@ -111,7 +111,7 @@ public class CountersBlock extends HtmlBlock {
           th().$title(g.getName()).$class("ui-state-default").
             _(fixGroupDisplayName(g.getDisplayName()))._().
           td().$class(C_TABLE).
-            table(".dt-counters").
+            table(".dt-counters").$id(job.getID()+"."+g.getName()).
               thead().
                 tr().th(".name", "Name");
 

+ 4 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/MRApp.java

@@ -42,6 +42,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.TypeConverter;
 import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
 import org.apache.hadoop.mapreduce.jobhistory.NormalizedResourceEvent;
+import org.apache.hadoop.mapreduce.security.TokenCache;
 import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
 import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -144,6 +145,9 @@ public class MRApp extends MRAppMaster {
   
   @Override
   protected void downloadTokensAndSetupUGI(Configuration conf) {
+    // Fake a shuffle secret that normally is provided by the job client.
+    String shuffleSecret = "fake-shuffle-secret";
+    TokenCache.setShuffleSecretKey(shuffleSecret.getBytes(), getCredentials());
   }
 
   private static ApplicationAttemptId getApplicationAttemptId(

+ 123 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestRecovery.java

@@ -900,6 +900,117 @@ public class TestRecovery {
 
   }
 
+  @Test(timeout=30000)
+  public void testRecoveryWithoutShuffleSecret() throws Exception {
+
+    int runCount = 0;
+    MRApp app = new MRAppNoShuffleSecret(2, 1, false,
+        this.getClass().getName(), true, ++runCount);
+    Configuration conf = new Configuration();
+    conf.setBoolean("mapred.mapper.new-api", true);
+    conf.setBoolean("mapred.reducer.new-api", true);
+    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
+    conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
+    Job job = app.submit(conf);
+    app.waitForState(job, JobState.RUNNING);
+    //all maps would be running
+    Assert.assertEquals("No of tasks not correct",
+       3, job.getTasks().size());
+    Iterator<Task> it = job.getTasks().values().iterator();
+    Task mapTask1 = it.next();
+    Task mapTask2 = it.next();
+    Task reduceTask = it.next();
+
+    // all maps must be running
+    app.waitForState(mapTask1, TaskState.RUNNING);
+    app.waitForState(mapTask2, TaskState.RUNNING);
+
+    TaskAttempt task1Attempt = mapTask1.getAttempts().values().iterator().next();
+    TaskAttempt task2Attempt = mapTask2.getAttempts().values().iterator().next();
+
+    //before sending the TA_DONE event, make sure attempt has come to
+    //RUNNING state
+    app.waitForState(task1Attempt, TaskAttemptState.RUNNING);
+    app.waitForState(task2Attempt, TaskAttemptState.RUNNING);
+
+    // the reduce must be in RUNNING state
+    Assert.assertEquals("Reduce Task state not correct",
+        TaskState.RUNNING, reduceTask.getReport().getTaskState());
+
+    //send the done signal to the 1st map attempt
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            task1Attempt.getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    //wait for first map task to complete
+    app.waitForState(mapTask1, TaskState.SUCCEEDED);
+
+    //stop the app
+    app.stop();
+
+    //in recovery the 1st map should NOT be recovered from previous run
+    //since the shuffle secret was not provided with the job credentials
+    //and had to be rolled per app attempt
+    app = new MRAppNoShuffleSecret(2, 1, false,
+        this.getClass().getName(), false, ++runCount);
+    conf = new Configuration();
+    conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
+    conf.setBoolean("mapred.mapper.new-api", true);
+    conf.setBoolean("mapred.reducer.new-api", true);
+    conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
+    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
+    job = app.submit(conf);
+    app.waitForState(job, JobState.RUNNING);
+    //all maps would be running
+    Assert.assertEquals("No of tasks not correct",
+       3, job.getTasks().size());
+    it = job.getTasks().values().iterator();
+    mapTask1 = it.next();
+    mapTask2 = it.next();
+    reduceTask = it.next();
+
+    app.waitForState(mapTask1, TaskState.RUNNING);
+    app.waitForState(mapTask2, TaskState.RUNNING);
+
+    task2Attempt = mapTask2.getAttempts().values().iterator().next();
+    //before sending the TA_DONE event, make sure attempt has come to
+    //RUNNING state
+    app.waitForState(task2Attempt, TaskAttemptState.RUNNING);
+
+    //send the done signal to the 2nd map task
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            mapTask2.getAttempts().values().iterator().next().getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    //wait to get it completed
+    app.waitForState(mapTask2, TaskState.SUCCEEDED);
+
+    //verify first map task is still running
+    app.waitForState(mapTask1, TaskState.RUNNING);
+
+    //send the done signal to the 1st map task
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            mapTask1.getAttempts().values().iterator().next().getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    //wait to get it completed
+    app.waitForState(mapTask1, TaskState.SUCCEEDED);
+
+    //wait for reduce to be running before sending done
+    app.waitForState(reduceTask, TaskState.RUNNING);
+    //send the done signal to the reduce
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            reduceTask.getAttempts().values().iterator().next().getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    app.waitForState(job, JobState.SUCCEEDED);
+    app.verifyCompleted();
+  }
+
   private void writeBadOutput(TaskAttempt attempt, Configuration conf)
   throws Exception {
   TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, 
@@ -1019,6 +1130,18 @@ public class TestRecovery {
     }
   }
 
+  static class MRAppNoShuffleSecret extends MRAppWithHistory {
+    public MRAppNoShuffleSecret(int maps, int reduces, boolean autoComplete,
+        String testName, boolean cleanOnStart, int startCount) {
+      super(maps, reduces, autoComplete, testName, cleanOnStart, startCount);
+    }
+
+    @Override
+    protected void downloadTokensAndSetupUGI(Configuration conf) {
+      // do NOT put a shuffle secret in the job credentials
+    }
+  }
+
   public static void main(String[] arg) throws Exception {
     TestRecovery test = new TestRecovery();
     test.testCrashed();

+ 1 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestJobImpl.java

@@ -491,7 +491,7 @@ public class TestJobImpl {
     MRAppMetrics mrAppMetrics = MRAppMetrics.create();
     JobImpl job = new JobImpl(jobId, Records
         .newRecord(ApplicationAttemptId.class), conf, mock(EventHandler.class),
-        null, mock(JobTokenSecretManager.class), null, null, null,
+        null, new JobTokenSecretManager(), new Credentials(), null, null,
         mrAppMetrics, true, null, 0, null, null, null, null);
     InitTransition initTransition = getInitTransition(2);
     JobEvent mockJobEvent = mock(JobEvent.class);

+ 17 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java

@@ -185,6 +185,7 @@ abstract public class Task implements Writable, Configurable {
   private int numSlotsRequired;
   protected TaskUmbilicalProtocol umbilical;
   protected SecretKey tokenSecret;
+  protected SecretKey shuffleSecret;
   protected GcTimeUpdater gcUpdater;
 
   ////////////////////////////////////////////
@@ -261,7 +262,22 @@ abstract public class Task implements Writable, Configurable {
     return this.tokenSecret;
   }
 
-  
+  /**
+   * Set the secret key used to authenticate the shuffle
+   * @param shuffleSecret the secret
+   */
+  public void setShuffleSecret(SecretKey shuffleSecret) {
+    this.shuffleSecret = shuffleSecret;
+  }
+
+  /**
+   * Get the secret key used to authenticate the shuffle
+   * @return the shuffle secret
+   */
+  public SecretKey getShuffleSecret() {
+    return this.shuffleSecret;
+  }
+
   /**
    * Get the index of this task within the job.
    * @return the integer part of the task id

+ 20 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobSubmitter.java

@@ -23,11 +23,15 @@ import java.net.InetAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.UnknownHostException;
+import java.security.NoSuchAlgorithmException;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 
+import javax.crypto.KeyGenerator;
+import javax.crypto.SecretKey;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -62,6 +66,8 @@ import com.google.common.base.Charsets;
 @InterfaceStability.Unstable
 class JobSubmitter {
   protected static final Log LOG = LogFactory.getLog(JobSubmitter.class);
+  private static final String SHUFFLE_KEYGEN_ALGORITHM = "HmacSHA1";
+  private static final int SHUFFLE_KEY_LENGTH = 64;
   private FileSystem jtFs;
   private ClientProtocol submitClient;
   private String submitHostName;
@@ -359,6 +365,20 @@ class JobSubmitter {
       
       populateTokenCache(conf, job.getCredentials());
 
+      // generate a secret to authenticate shuffle transfers
+      if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
+        KeyGenerator keyGen;
+        try {
+          keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
+          keyGen.init(SHUFFLE_KEY_LENGTH);
+        } catch (NoSuchAlgorithmException e) {
+          throw new IOException("Error generating shuffle secret key", e);
+        }
+        SecretKey shuffleKey = keyGen.generateKey();
+        TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
+            job.getCredentials());
+      }
+
       copyAndConfigureFiles(job, submitJobDir);
       Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
       

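The hunk above generates a per-job shuffle secret with the JCE KeyGenerator. A minimal standalone sketch of that pattern, assuming only javax.crypto (the class and variable names below are illustrative, not Hadoop APIs):

    import java.security.NoSuchAlgorithmException;
    import javax.crypto.KeyGenerator;
    import javax.crypto.SecretKey;

    public class ShuffleKeyDemo {
      public static void main(String[] args) throws NoSuchAlgorithmException {
        // Generate a 64-bit HMAC-SHA1 key, mirroring SHUFFLE_KEYGEN_ALGORITHM
        // and SHUFFLE_KEY_LENGTH in the hunk above.
        KeyGenerator keyGen = KeyGenerator.getInstance("HmacSHA1");
        keyGen.init(64);
        SecretKey shuffleKey = keyGen.generateKey();
        // The raw bytes are what would end up in the job credentials.
        byte[] raw = shuffleKey.getEncoded();
        System.out.println("generated a " + (raw.length * 8) + "-bit shuffle key");
      }
    }
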
+ 12 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/security/TokenCache.java

@@ -154,7 +154,8 @@ public class TokenCache {
    */
   @InterfaceAudience.Private
   public static final String JOB_TOKENS_FILENAME = "mapreduce.job.jobTokenFile";
-  private static final Text JOB_TOKEN = new Text("ShuffleAndJobToken");
+  private static final Text JOB_TOKEN = new Text("JobToken");
+  private static final Text SHUFFLE_TOKEN = new Text("MapReduceShuffleToken");
   
   /**
    * load job token from a file
@@ -194,4 +195,14 @@ public class TokenCache {
   public static Token<JobTokenIdentifier> getJobToken(Credentials credentials) {
     return (Token<JobTokenIdentifier>) credentials.getToken(JOB_TOKEN);
   }
+
+  @InterfaceAudience.Private
+  public static void setShuffleSecretKey(byte[] key, Credentials credentials) {
+    credentials.addSecretKey(SHUFFLE_TOKEN, key);
+  }
+
+  @InterfaceAudience.Private
+  public static byte[] getShuffleSecretKey(Credentials credentials) {
+    return getSecretKey(credentials, SHUFFLE_TOKEN);
+  }
 }

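The new helpers above are thin wrappers around the public Credentials secret-key API. A hedged sketch of that round trip, using an illustrative alias rather than the real MapReduceShuffleToken alias:

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.security.Credentials;

    public class CredentialsSecretDemo {
      // Illustrative alias; the real shuffle secret uses its own private alias.
      private static final Text DEMO_ALIAS = new Text("DemoShuffleSecret");

      public static void main(String[] args) {
        Credentials credentials = new Credentials();
        byte[] secret = new byte[] {1, 2, 3, 4};
        // Store the secret under a dedicated alias, separate from any tokens.
        credentials.addSecretKey(DEMO_ALIAS, secret);
        // Consumers look it up by the same alias; null means "not set".
        byte[] fetched = credentials.getSecretKey(DEMO_ALIAS);
        System.out.println(fetched == null ? "missing" : fetched.length + " bytes");
      }
    }
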
+ 9 - 18
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Fetcher.java

@@ -82,7 +82,7 @@ class Fetcher<K,V> extends Thread {
   private final int connectionTimeout;
   private final int readTimeout;
   
-  private final SecretKey jobTokenSecret;
+  private final SecretKey shuffleSecretKey;
 
   private volatile boolean stopped = false;
 
@@ -92,7 +92,7 @@ class Fetcher<K,V> extends Thread {
   public Fetcher(JobConf job, TaskAttemptID reduceId, 
                  ShuffleScheduler<K,V> scheduler, MergeManager<K,V> merger,
                  Reporter reporter, ShuffleClientMetrics metrics,
-                 ExceptionReporter exceptionReporter, SecretKey jobTokenSecret) {
+                 ExceptionReporter exceptionReporter, SecretKey shuffleKey) {
     this.reporter = reporter;
     this.scheduler = scheduler;
     this.merger = merger;
@@ -100,7 +100,7 @@ class Fetcher<K,V> extends Thread {
     this.exceptionReporter = exceptionReporter;
     this.id = ++nextId;
     this.reduce = reduceId.getTaskID().getId();
-    this.jobTokenSecret = jobTokenSecret;
+    this.shuffleSecretKey = shuffleKey;
     ioErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME,
         ShuffleErrors.IO_ERROR.toString());
     wrongLengthErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME,
@@ -221,7 +221,6 @@ class Fetcher<K,V> extends Thread {
     
     // Construct the url and connect
     DataInputStream input;
-    boolean connectSucceeded = false;
     
     try {
       URL url = getMapOutputURL(host, maps);
@@ -229,7 +228,8 @@ class Fetcher<K,V> extends Thread {
       
       // generate hash of the url
       String msgToEncode = SecureShuffleUtils.buildMsgFrom(url);
-      String encHash = SecureShuffleUtils.hashFromString(msgToEncode, jobTokenSecret);
+      String encHash = SecureShuffleUtils.hashFromString(msgToEncode,
+          shuffleSecretKey);
       
       // put url hash into http header
       connection.addRequestProperty(
@@ -237,7 +237,6 @@ class Fetcher<K,V> extends Thread {
       // set the read timeout
       connection.setReadTimeout(readTimeout);
       connect(connection, connectionTimeout);
-      connectSucceeded = true;
       input = new DataInputStream(connection.getInputStream());
 
       // Validate response code
@@ -255,7 +254,7 @@ class Fetcher<K,V> extends Thread {
       }
       LOG.debug("url="+msgToEncode+";encHash="+encHash+";replyHash="+replyHash);
       // verify that replyHash is HMac of encHash
-      SecureShuffleUtils.verifyReply(replyHash, encHash, jobTokenSecret);
+      SecureShuffleUtils.verifyReply(replyHash, encHash, shuffleSecretKey);
       LOG.info("for url="+msgToEncode+" sent hash and received reply");
     } catch (IOException ie) {
       boolean connectExcpt = ie instanceof ConnectException;
@@ -265,18 +264,10 @@ class Fetcher<K,V> extends Thread {
 
       // If connect did not succeed, just mark all the maps as failed,
       // indirectly penalizing the host
-      if (!connectSucceeded) {
-        for(TaskAttemptID left: remaining) {
-          scheduler.copyFailed(left, host, connectSucceeded, connectExcpt);
-        }
-      } else {
-        // If we got a read error at this stage, it implies there was a problem
-        // with the first map, typically lost map. So, penalize only that map
-        // and add the rest
-        TaskAttemptID firstMap = maps.get(0);
-        scheduler.copyFailed(firstMap, host, connectSucceeded, connectExcpt);
+      for(TaskAttemptID left: remaining) {
+        scheduler.copyFailed(left, host, false, connectExcpt);
       }
-      
+     
       // Add back all the remaining maps, WITHOUT marking them as failed
       for(TaskAttemptID left: remaining) {
         scheduler.putBackKnownMapOutput(host, left);

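The fetcher signs the shuffle URL with the shuffle secret and checks that the server's reply is an HMAC of that hash. A rough sketch of the sign/verify idea using plain JCE, assuming Java 8 for Base64; the encoding details are illustrative and not necessarily identical to SecureShuffleUtils:

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.util.Base64;
    import javax.crypto.Mac;
    import javax.crypto.spec.SecretKeySpec;

    public class HmacShuffleDemo {
      // HMAC-SHA1 over a message, Base64-encoded.
      static String sign(String msg, byte[] key) throws Exception {
        Mac mac = Mac.getInstance("HmacSHA1");
        mac.init(new SecretKeySpec(key, "HmacSHA1"));
        return Base64.getEncoder().encodeToString(
            mac.doFinal(msg.getBytes(StandardCharsets.UTF_8)));
      }

      // Constant-time check that the peer's reply hash matches our own HMAC.
      static boolean verify(String replyHash, String msg, byte[] key) throws Exception {
        return MessageDigest.isEqual(
            replyHash.getBytes(StandardCharsets.UTF_8),
            sign(msg, key).getBytes(StandardCharsets.UTF_8));
      }

      public static void main(String[] args) throws Exception {
        byte[] key = "demo-secret".getBytes(StandardCharsets.UTF_8);
        String urlHash = sign("http://host:8080/mapOutput?map=attempt_0", key);
        // In the real protocol the server signs the client's hash; here we just
        // demonstrate that verify(sign(x)) holds for the same key.
        System.out.println(verify(sign(urlHash, key), urlHash, key));
      }
    }
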
+ 1 - 1
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/Shuffle.java

@@ -108,7 +108,7 @@ public class Shuffle<K, V> implements ShuffleConsumerPlugin<K, V>, ExceptionRepo
     for (int i=0; i < numFetchers; ++i) {
       fetchers[i] = new Fetcher<K,V>(jobConf, reduceId, scheduler, merger, 
                                      reporter, metrics, this, 
-                                     reduceTask.getJobTokenSecret());
+                                     reduceTask.getShuffleSecret());
       fetchers[i].start();
     }
     

+ 49 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/task/reduce/TestFetcher.java

@@ -26,6 +26,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.net.HttpURLConnection;
+import java.net.SocketTimeoutException;
 import java.net.URL;
 import java.util.ArrayList;
 
@@ -70,6 +71,54 @@ public class TestFetcher {
     }
   }
   
+  @SuppressWarnings("unchecked")
+  @Test(timeout=30000)
+  public void testCopyFromHostConnectionTimeout() throws Exception {
+    LOG.info("testCopyFromHostConnectionTimeout");
+    JobConf job = new JobConf();
+    TaskAttemptID id = TaskAttemptID.forName("attempt_0_1_r_1_1");
+    ShuffleScheduler<Text, Text> ss = mock(ShuffleScheduler.class);
+    MergeManagerImpl<Text, Text> mm = mock(MergeManagerImpl.class);
+    Reporter r = mock(Reporter.class);
+    ShuffleClientMetrics metrics = mock(ShuffleClientMetrics.class);
+    ExceptionReporter except = mock(ExceptionReporter.class);
+    SecretKey key = JobTokenSecretManager.createSecretKey(new byte[]{0,0,0,0});
+    HttpURLConnection connection = mock(HttpURLConnection.class);
+    when(connection.getInputStream()).thenThrow(
+        new SocketTimeoutException("This is a fake timeout :)"));
+    
+    Counters.Counter allErrs = mock(Counters.Counter.class);
+    when(r.getCounter(anyString(), anyString()))
+      .thenReturn(allErrs);
+    
+    Fetcher<Text,Text> underTest = new FakeFetcher<Text,Text>(job, id, ss, mm,
+        r, metrics, except, key, connection);
+
+    MapHost host = new MapHost("localhost", "http://localhost:8080/");
+    
+    ArrayList<TaskAttemptID> maps = new ArrayList<TaskAttemptID>(1);
+    TaskAttemptID map1ID = TaskAttemptID.forName("attempt_0_1_m_1_1");
+    maps.add(map1ID);
+    TaskAttemptID map2ID = TaskAttemptID.forName("attempt_0_1_m_2_1");
+    maps.add(map2ID);
+    when(ss.getMapsForHost(host)).thenReturn(maps);
+    
+    String encHash = "vFE234EIFCiBgYs2tCXY/SjT8Kg=";
+    
+    underTest.copyFromHost(host);
+    
+    verify(connection)
+      .addRequestProperty(SecureShuffleUtils.HTTP_HEADER_URL_HASH, 
+          encHash);
+    
+    verify(allErrs).increment(1);
+    verify(ss).copyFailed(map1ID, host, false, false);
+    verify(ss).copyFailed(map2ID, host, false, false);
+    
+    verify(ss).putBackKnownMapOutput(any(MapHost.class), eq(map1ID));
+    verify(ss).putBackKnownMapOutput(any(MapHost.class), eq(map2ID));
+  }
+  
   @SuppressWarnings("unchecked")
   @Test
   public void testCopyFromHostBogusHeader() throws Exception {

+ 6 - 2
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/webapp/HsTasksBlock.java

@@ -65,8 +65,12 @@ public class HsTasksBlock extends HtmlBlock {
     if (!symbol.isEmpty()) {
       type = MRApps.taskType(symbol);
     }
-
-    THEAD<TABLE<Hamlet>> thead = html.table("#tasks").thead();
+    THEAD<TABLE<Hamlet>> thead;
+    if (type != null)
+      thead = html.table("#" + app.getJob().getID()
+          + type).$class("dt-tasks").thead();
+    else
+      thead = html.table("#tasks").thead();
     //Create the spanning row
     int attemptColSpan = type == TaskType.REDUCE ? 8 : 3;
     thead.tr().

+ 4 - 0
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/webapp/HsTasksPage.java

@@ -22,7 +22,9 @@ import static org.apache.hadoop.mapreduce.v2.app.webapp.AMParams.TASK_TYPE;
 import static org.apache.hadoop.yarn.webapp.view.JQueryUI.ACCORDION;
 import static org.apache.hadoop.yarn.webapp.view.JQueryUI.DATATABLES;
 import static org.apache.hadoop.yarn.webapp.view.JQueryUI.DATATABLES_ID;
+import static org.apache.hadoop.yarn.webapp.view.JQueryUI.DATATABLES_SELECTOR;
 import static org.apache.hadoop.yarn.webapp.view.JQueryUI.initID;
+import static org.apache.hadoop.yarn.webapp.view.JQueryUI.initSelector;
 import static org.apache.hadoop.yarn.webapp.view.JQueryUI.postInitID;
 import static org.apache.hadoop.yarn.webapp.view.JQueryUI.tableInit;
 
@@ -42,6 +44,8 @@ public class HsTasksPage extends HsView {
   @Override protected void preHead(Page.HTML<_> html) {
     commonPreHead(html);
     set(DATATABLES_ID, "tasks");
+    set(DATATABLES_SELECTOR, ".dt-tasks" );
+    set(initSelector(DATATABLES), tasksTableInit());
     set(initID(ACCORDION, "nav"), "{autoHeight:false, active:1}");
     set(initID(DATATABLES, "tasks"), tasksTableInit());
     set(postInitID(DATATABLES, "tasks"), jobsPostTableInit());

+ 16 - 7
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/MockHistoryJobs.java

@@ -77,13 +77,18 @@ public class MockHistoryJobs extends MockJobs {
     for(Map.Entry<JobId, Job> entry: mocked.entrySet()) {
       JobId id = entry.getKey();
       Job j = entry.getValue();
-      ret.full.put(id, new MockCompletedJob(j));
-      JobReport report = j.getReport();
+      MockCompletedJob mockJob = new MockCompletedJob(j);
+      // use MockCompletedJob to set everything below to make sure it is
+      // consistent with what the history server would do
+      ret.full.put(id, mockJob);
+      JobReport report = mockJob.getReport();
       JobIndexInfo info = new JobIndexInfo(report.getStartTime(), 
-          report.getFinishTime(), j.getUserName(), j.getName(), id, 
-          j.getCompletedMaps(), j.getCompletedReduces(), String.valueOf(j.getState()));
-      info.setQueueName(j.getQueueName());
+          report.getFinishTime(), mockJob.getUserName(), mockJob.getName(), id, 
+          mockJob.getCompletedMaps(), mockJob.getCompletedReduces(),
+          String.valueOf(mockJob.getState()));
+      info.setQueueName(mockJob.getQueueName());
       ret.partial.put(id, new PartialJob(info, id));
+
     }
     return ret;
   }
@@ -99,12 +104,16 @@ public class MockHistoryJobs extends MockJobs {
 
     @Override
     public int getCompletedMaps() {
-      return job.getCompletedMaps();
+      // we always return the total since this is the history server
+      // and PartialJob also treats completed as equal to total
+      return job.getTotalMaps();
     }
 
     @Override
     public int getCompletedReduces() {
-      return job.getCompletedReduces();
+      // we always return the total since this is the history server
+      // and PartialJob also treats completed as equal to total
+      return job.getTotalReduces();
     }
 
     @Override

+ 7 - 5
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/webapp/TestHsWebServicesJobs.java

@@ -117,6 +117,7 @@ public class TestHsWebServicesJobs extends JerseyTest {
       fullJobs = jobs.full;
     }
 
+
     TestAppContext(int appid, int numJobs, int numTasks, int numAttempts) {
       this(appid, numJobs, numTasks, numAttempts, false);
     }
@@ -411,7 +412,8 @@ public class TestHsWebServicesJobs extends JerseyTest {
       JSONObject json = response.getEntity(JSONObject.class);
       assertEquals("incorrect number of elements", 1, json.length());
       JSONObject info = json.getJSONObject("job");
-      VerifyJobsUtils.verifyHsJob(info, jobsMap.get(id));
+
+      VerifyJobsUtils.verifyHsJob(info, appContext.getJob(id));
     }
   }
 
@@ -613,7 +615,7 @@ public class TestHsWebServicesJobs extends JerseyTest {
       JSONObject json = response.getEntity(JSONObject.class);
       assertEquals("incorrect number of elements", 1, json.length());
       JSONObject info = json.getJSONObject("jobCounters");
-      verifyHsJobCounters(info, jobsMap.get(id));
+      verifyHsJobCounters(info, appContext.getJob(id));
     }
   }
 
@@ -631,7 +633,7 @@ public class TestHsWebServicesJobs extends JerseyTest {
       JSONObject json = response.getEntity(JSONObject.class);
       assertEquals("incorrect number of elements", 1, json.length());
       JSONObject info = json.getJSONObject("jobCounters");
-      verifyHsJobCounters(info, jobsMap.get(id));
+      verifyHsJobCounters(info, appContext.getJob(id));
     }
   }
   
@@ -689,7 +691,7 @@ public class TestHsWebServicesJobs extends JerseyTest {
       JSONObject json = response.getEntity(JSONObject.class);
       assertEquals("incorrect number of elements", 1, json.length());
       JSONObject info = json.getJSONObject("jobCounters");
-      verifyHsJobCounters(info, jobsMap.get(id));
+      verifyHsJobCounters(info, appContext.getJob(id));
     }
   }
 
@@ -711,7 +713,7 @@ public class TestHsWebServicesJobs extends JerseyTest {
       is.setCharacterStream(new StringReader(xml));
       Document dom = db.parse(is);
       NodeList info = dom.getElementsByTagName("jobCounters");
-      verifyHsJobCountersXML(info, jobsMap.get(id));
+      verifyHsJobCountersXML(info, appContext.getJob(id));
     }
   }
 

+ 2 - 2
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/webapp/TestHsWebServicesJobsQuery.java

@@ -284,9 +284,9 @@ public class TestHsWebServicesJobsQuery extends JerseyTest {
     String type = exception.getString("exception");
     String classname = exception.getString("javaClassName");
     WebServicesTestUtils
-        .checkStringMatch(
+        .checkStringContains(
             "exception message",
-            "No enum const class org.apache.hadoop.mapreduce.v2.api.records.JobState.InvalidState",
+            "org.apache.hadoop.mapreduce.v2.api.records.JobState.InvalidState",
             message);
     WebServicesTestUtils.checkStringMatch("exception type",
         "IllegalArgumentException", type);

+ 4 - 3
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/pipes/TestPipeApplication.java

@@ -47,6 +47,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.IFile.Writer;
 import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.security.TokenCache;
 import org.apache.hadoop.mapred.Counters;
 import org.apache.hadoop.mapred.Counters.Counter;
 import org.apache.hadoop.mapred.Counters.Group;
@@ -106,7 +107,7 @@ public class TestPipeApplication {
       Token<ApplicationTokenIdentifier> token = new Token<ApplicationTokenIdentifier>(
               "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
               "service"));
-      conf.getCredentials().addToken(new Text("ShuffleAndJobToken"), token);
+      TokenCache.setJobToken(token,  conf.getCredentials());
       conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
       TestTaskReporter reporter = new TestTaskReporter();
       PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner = new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();
@@ -171,7 +172,7 @@ public class TestPipeApplication {
               "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
               "service"));
 
-      conf.getCredentials().addToken(new Text("ShuffleAndJobToken"), token);
+      TokenCache.setJobToken(token, conf.getCredentials());
       FakeCollector output = new FakeCollector(new Counters.Counter(),
               new Progress());
       FileSystem fs = new RawLocalFileSystem();
@@ -391,7 +392,7 @@ public class TestPipeApplication {
       Token<ApplicationTokenIdentifier> token = new Token<ApplicationTokenIdentifier>(
               "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
               "service"));
-      conf.getCredentials().addToken(new Text("ShuffleAndJobToken"), token);
+      TokenCache.setJobToken(token, conf.getCredentials());
 
       File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeReducerStub");
       conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());

+ 2 - 2
hadoop-maven-plugins/src/main/java/org/apache/hadoop/maven/plugin/protoc/ProtocMojo.java

@@ -37,7 +37,7 @@ public class ProtocMojo extends AbstractMojo {
   private MavenProject project;
 
   @Parameter
-  private List<File> imports;
+  private File[] imports;
 
   @Parameter(defaultValue="${project.build.directory}/generated-sources/java")
   private File output;
@@ -83,4 +83,4 @@ public class ProtocMojo extends AbstractMojo {
     project.addCompileSourceRoot(output.getAbsolutePath());
   }
 
-}
+}

+ 1 - 10
hadoop-project/pom.xml

@@ -46,7 +46,7 @@
 
     <hadoop.assemblies.version>${project.version}</hadoop.assemblies.version>
 
-    <commons-daemon.version>1.0.3</commons-daemon.version>
+    <commons-daemon.version>1.0.13</commons-daemon.version>
 
     <test.build.dir>${project.build.directory}/test-dir</test.build.dir>
     <test.build.data>${test.build.dir}</test.build.data>
@@ -864,15 +864,6 @@
       <properties>
         <build.platform>${os.name}-${os.arch}-${sun.arch.data.model}</build.platform>
       </properties>
-      <dependencies>
-        <dependency>
-          <groupId>jdk.tools</groupId>
-          <artifactId>jdk.tools</artifactId>
-          <version>1.6</version>
-          <scope>system</scope>
-          <systemPath>${java.home}/../lib/tools.jar</systemPath>
-        </dependency>
-      </dependencies>
     </profile>
     <profile>
       <id>os.mac</id>

+ 3 - 3
hadoop-tools/hadoop-gridmix/src/test/java/org/apache/hadoop/mapred/gridmix/TestGridmixSummary.java

@@ -257,7 +257,7 @@ public class TestGridmixSummary {
                  qPath.toString(), es.getInputTraceLocation());
     // test expected data size
     assertEquals("Mismatch in expected data size", 
-                 "1.0k", es.getExpectedDataSize());
+                 "1 K", es.getExpectedDataSize());
     // test input data statistics
     assertEquals("Mismatch in input data statistics", 
                  ExecutionSummarizer.stringifyDataStatistics(dataStats), 
@@ -272,7 +272,7 @@ public class TestGridmixSummary {
     es.finalize(factory, testTraceFile.toString(), 1024*1024*1024*10L, resolver,
                 dataStats, conf);
     assertEquals("Mismatch in expected data size", 
-                 "10.0g", es.getExpectedDataSize());
+                 "10 G", es.getExpectedDataSize());
     
     // test trace signature uniqueness
     //  touch the trace file
@@ -389,4 +389,4 @@ public class TestGridmixSummary {
     assertEquals("Cluster summary test failed!", 0, 
                  cs.getNumBlacklistedTrackers());
   }
-}
+}

+ 11 - 0
hadoop-yarn-project/CHANGES.txt

@@ -66,6 +66,12 @@ Release 2.0.5-beta - UNRELEASED
     the per-application page are translated to html line breaks. (Omkar Vinit
     Joshi via vinodkv)
 
+    YARN-198. Added a link to RM pages from the NodeManager web app. (Jian He
+    via vinodkv)
+
+    YARN-237. Refreshing the RM page forgets how many rows I had in my
+    Datatables (jian he via bobby)
+
   OPTIMIZATIONS
 
   BUG FIXES
@@ -91,6 +97,9 @@ Release 2.0.5-beta - UNRELEASED
     YARN-376. Fixes a bug which would prevent the NM knowing about completed
     containers and applications. (Jason Lowe via sseth)
 
+    YARN-196. Nodemanager should be more robust in handling connection failure
+    to ResourceManager when a cluster is started (Xuan Gong via hitesh)
+
 Release 2.0.4-alpha - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -396,6 +405,8 @@ Release 0.23.7 - UNRELEASED
     YARN-443. allow OS scheduling priority of NM to be different than the 
     containers it launches (tgraves)
 
+    YARN-468. coverage fix for org.apache.hadoop.yarn.server.webproxy.amfilter
+    (Aleksey Gorshkov via bobby)
 
   OPTIMIZATIONS
 

+ 14 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -621,6 +621,20 @@ public class YarnConfiguration extends Configuration {
   public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
       2000;
 
+  /** Max time to wait to establish a connection to RM when NM starts
+   */
+  public static final String RESOURCEMANAGER_CONNECT_WAIT_SECS =
+      NM_PREFIX + "resourcemanager.connect.wait.secs";
+  public static final int DEFAULT_RESOURCEMANAGER_CONNECT_WAIT_SECS =
+      15*60;
+
+  /** Time interval between each NM attempt to connect to RM
+   */
+  public static final String RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS =
+      NM_PREFIX + "resourcemanager.connect.retry_interval.secs";
+  public static final long DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS
+      = 30;
+
   /**
    * CLASSPATH for YARN applications. A comma-separated list of CLASSPATH
    * entries

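A small sketch of reading the two keys added above and converting them to milliseconds, the same unit conversion the NodeStatusUpdater performs. It assumes a YARN build that defines these constants:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;

    public class RmConnectConfigDemo {
      public static void main(String[] args) {
        Configuration conf = new YarnConfiguration();
        // -1 means "wait forever"; any other negative value is rejected by the NM.
        long waitMs = conf.getInt(
            YarnConfiguration.RESOURCEMANAGER_CONNECT_WAIT_SECS,
            YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_WAIT_SECS) * 1000L;
        long retryIntervalMs = conf.getLong(
            YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS,
            YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS) * 1000L;
        System.out.println("wait=" + waitMs + "ms, retry interval=" + retryIntervalMs + "ms");
      }
    }
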
+ 14 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/view/JQueryUI.java

@@ -107,12 +107,21 @@ public class JQueryUI extends HtmlBlock {
 
   protected void initDataTables(List<String> list) {
     String defaultInit = "{bJQueryUI: true, sPaginationType: 'full_numbers'}";
+    String stateSaveInit = "bStateSave : true, " +
+          "\"fnStateSave\": function (oSettings, oData) { " +
+              "sessionStorage.setItem( oSettings.sTableId, JSON.stringify(oData) ); }, " +
+          "\"fnStateLoad\": function (oSettings) { " +
+              "return JSON.parse( sessionStorage.getItem(oSettings.sTableId) );}, ";
+      
     for (String id : split($(DATATABLES_ID))) {
       if (Html.isValidId(id)) {
         String init = $(initID(DATATABLES, id));
         if (init.isEmpty()) {
           init = defaultInit;
         }
+        // for inserting stateSaveInit
+        int pos = init.indexOf('{') + 1;  
+        init = new StringBuffer(init).insert(pos, stateSaveInit).toString(); 
         list.add(join(id,"DataTable =  $('#", id, "').dataTable(", init,
                       ").fnSetFilteringDelay(188);"));
         String postInit = $(postInitID(DATATABLES, id));
@@ -126,9 +135,12 @@ public class JQueryUI extends HtmlBlock {
       String init = $(initSelector(DATATABLES));
       if (init.isEmpty()) {
         init = defaultInit;
-      }
+      }      
+      int pos = init.indexOf('{') + 1;  
+      init = new StringBuffer(init).insert(pos, stateSaveInit).toString();  
       list.add(join("  $('", escapeJavaScript(selector), "').dataTable(", init,
-               ").fnSetFilteringDelay(288);"));
+               ").fnSetFilteringDelay(288);"));      
+      
     }
   }
 

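The change above splices the stateSave options into whatever init string a page supplied, right after its opening brace. A tiny sketch of that string-splicing step in isolation (illustrative values only):

    public class StateSaveInitDemo {
      public static void main(String[] args) {
        String init = "{bJQueryUI: true, sPaginationType: 'full_numbers'}";
        String stateSave = "bStateSave: true, ";
        // Insert just after the opening brace, as initDataTables() does above;
        // the sessionStorage callbacks themselves are plain DataTables JavaScript.
        int pos = init.indexOf('{') + 1;
        String patched = new StringBuilder(init).insert(pos, stateSave).toString();
        System.out.println(patched);
      }
    }
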
+ 14 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -597,6 +597,20 @@
     <value>2000</value>
   </property>
 
+  <property>
+    <description>Max time, in seconds, to wait to establish a connection to the RM when the NM starts.
+    The NM will shut down if it cannot connect to the RM within the specified max time period.
+    If the value is set to -1, the NM will retry forever.</description>
+    <name>yarn.nodemanager.resourcemanager.connect.wait.secs</name>
+    <value>900</value>
+  </property>
+
+  <property>
+    <description>Time interval, in seconds, between each NM attempt to connect to RM.</description>
+    <name>yarn.nodemanager.resourcemanager.connect.retry_interval.secs</name>
+    <value>30</value>
+  </property>
+
   <!--Map Reduce configuration-->
   <property>
     <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java

@@ -350,7 +350,7 @@ public class NodeManager extends CompositeService
   ContainerManagerImpl getContainerManager() {
     return containerManager;
   }
-
+  
   public static void main(String[] args) {
     Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
     StringUtils.startupShutdownMessage(NodeManager.class, args, LOG);

+ 75 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java

@@ -151,7 +151,6 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
         YarnConfiguration.DEFAULT_NM_WEBAPP_ADDRESS,
         YarnConfiguration.DEFAULT_NM_WEBAPP_PORT);
     try {
-      //      this.hostName = InetAddress.getLocalHost().getCanonicalHostName();
       this.httpPort = httpBindAddress.getPort();
       // Registration has to be in start so that ContainerManager can get the
       // perNM tokens needed to authenticate ContainerTokens.
@@ -189,15 +188,84 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
   }
 
   private void registerWithRM() throws YarnRemoteException {
-    this.resourceTracker = getRMClient();
-    LOG.info("Connecting to ResourceManager at " + this.rmAddress);
-    
-    RegisterNodeManagerRequest request = recordFactory.newRecordInstance(RegisterNodeManagerRequest.class);
+    Configuration conf = getConfig();
+    long rmConnectWaitMS =
+        conf.getInt(
+            YarnConfiguration.RESOURCEMANAGER_CONNECT_WAIT_SECS,
+            YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_WAIT_SECS)
+        * 1000;
+    long rmConnectionRetryIntervalMS =
+        conf.getLong(
+            YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS,
+            YarnConfiguration
+                .DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS)
+        * 1000;
+
+    if(rmConnectionRetryIntervalMS < 0) {
+      throw new YarnException("Invalid Configuration. " +
+          YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS +
+          " should not be negative.");
+    }
+
+    boolean waitForEver = (rmConnectWaitMS == -1000);
+
+    if(! waitForEver) {
+      if(rmConnectWaitMS < 0) {
+          throw new YarnException("Invalid Configuration. " +
+              YarnConfiguration.RESOURCEMANAGER_CONNECT_WAIT_SECS +
+              " can be -1, but can not be other negative numbers");
+      }
+
+      // only try to connect once
+      if (rmConnectWaitMS < rmConnectionRetryIntervalMS) {
+        LOG.warn(YarnConfiguration.RESOURCEMANAGER_CONNECT_WAIT_SECS
+            + " is smaller than "
+            + YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS
+            + ". Only trying to connect once.");
+        rmConnectWaitMS = 0;
+      }
+    }
+
+    int rmRetryCount = 0;
+    long waitStartTime = System.currentTimeMillis();
+
+    RegisterNodeManagerRequest request =
+        recordFactory.newRecordInstance(RegisterNodeManagerRequest.class);
     request.setHttpPort(this.httpPort);
     request.setResource(this.totalResource);
     request.setNodeId(this.nodeId);
-    RegistrationResponse regResponse =
-        this.resourceTracker.registerNodeManager(request).getRegistrationResponse();
+    RegistrationResponse regResponse;
+
+    while(true) {
+      try {
+        rmRetryCount++;
+        LOG.info("Connecting to ResourceManager at " + this.rmAddress
+            + ". current no. of attempts is " + rmRetryCount);
+        this.resourceTracker = getRMClient();
+        regResponse =
+            this.resourceTracker.registerNodeManager(request)
+                .getRegistrationResponse();
+        break;
+      } catch(Throwable e) {
+        LOG.warn("Trying to connect to ResourceManager, " +
+            "current no. of failed attempts is "+rmRetryCount);
+        if(System.currentTimeMillis() - waitStartTime < rmConnectWaitMS
+            || waitForEver) {
+          try {
+            LOG.info("Sleeping for " + rmConnectionRetryIntervalMS/1000
+                + " seconds before next connection retry to RM");
+            Thread.sleep(rmConnectionRetryIntervalMS);
+          } catch(InterruptedException ex) {
+            // do nothing; retry on the next loop iteration
+          }
+        } else {
+          String errorMessage = "Failed to Connect to RM, " +
+              "no. of failed attempts is "+rmRetryCount;
+          LOG.error(errorMessage,e);
+          throw new YarnException(errorMessage,e);
+        }
+      }
+    }
     // if the Resourcemanager instructs NM to shutdown.
     if (NodeAction.SHUTDOWN.equals(regResponse.getNodeAction())) {
       throw new YarnException(

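The registration change above is essentially a retry-until-deadline loop. A generic, hedged sketch of that pattern (not the NodeStatusUpdater API itself; a wait of -1 means retry forever, mirroring the configuration semantics):

    import java.util.concurrent.Callable;

    public class RetryUntilDeadlineDemo {
      static <T> T retry(Callable<T> call, long waitMs, long intervalMs)
          throws Exception {
        boolean waitForever = (waitMs == -1);
        long start = System.currentTimeMillis();
        int attempts = 0;
        while (true) {
          try {
            attempts++;
            return call.call();
          } catch (Exception e) {
            // Give up once the deadline has passed, unless configured to wait forever.
            if (!waitForever && System.currentTimeMillis() - start >= waitMs) {
              throw new Exception("giving up after " + attempts + " attempts", e);
            }
            Thread.sleep(intervalMs);
          }
        }
      }

      public static void main(String[] args) throws Exception {
        final long upAt = System.currentTimeMillis() + 1500;
        String result = retry(() -> {
          if (System.currentTimeMillis() < upAt) {
            throw new RuntimeException("resource manager not up yet");
          }
          return "registered";
        }, 5000, 500);
        System.out.println(result);
      }
    }
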
+ 19 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NavBlock.java

@@ -18,16 +18,32 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.webapp;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.webapp.YarnWebParams;
 import org.apache.hadoop.yarn.webapp.view.HtmlBlock;
 
+import com.google.inject.Inject;
+
 public class NavBlock extends HtmlBlock implements YarnWebParams {
 
+  private Configuration conf;
+
+  @Inject
+  public NavBlock(Configuration conf) {
+    this.conf = conf;
+  }
+
   @Override
   protected void render(Block html) {
-    html
+
+    String RMWebAppURL = YarnConfiguration.getRMWebAppURL(this.conf);
+    html
       .div("#nav")
-        .h3()._("NodeManager")._() // TODO: Problem if no header like this
+      .h3()._("ResourceManager")._()
+        .ul()
+          .li().a(RMWebAppURL, "RM Home")._()._()
+      .h3()._("NodeManager")._() // TODO: Problem if no header like this
         .ul()
           .li()
             .a(url("node"), "Node Information")._()
@@ -37,7 +53,7 @@ public class NavBlock extends HtmlBlock implements YarnWebParams {
           .li()
             .a(url("allContainers"), "List of Containers")._()
         ._()
-        .h3("Tools")
+      .h3("Tools")
         .ul()
           .li().a("/conf", "Configuration")._()
           .li().a("/logs", "Local logs")._()

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NodePage.java

@@ -42,7 +42,7 @@ public class NodePage extends NMView {
   protected void commonPreHead(HTML<_> html) {
     super.commonPreHead(html);
 
-    set(initID(ACCORDION, "nav"), "{autoHeight:false, active:0}");
+    set(initID(ACCORDION, "nav"), "{autoHeight:false, active:1}");
   }
 
   @Override

+ 97 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java

@@ -267,6 +267,36 @@ public class TestNodeStatusUpdater {
     }
   }
 
+  private class MyNodeStatusUpdater4 extends NodeStatusUpdaterImpl {
+    public ResourceTracker resourceTracker =
+        new MyResourceTracker(this.context);
+    private Context context;
+    private final long waitStartTime;
+    private final long rmStartIntervalMS;
+    private final boolean rmNeverStart;
+
+    public MyNodeStatusUpdater4(Context context, Dispatcher dispatcher,
+        NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+        long rmStartIntervalMS, boolean rmNeverStart) {
+      super(context, dispatcher, healthChecker, metrics);
+      this.context = context;
+      this.waitStartTime = System.currentTimeMillis();
+      this.rmStartIntervalMS = rmStartIntervalMS;
+      this.rmNeverStart = rmNeverStart;
+    }
+
+    @Override
+    protected ResourceTracker getRMClient() {
+      if(System.currentTimeMillis() - waitStartTime <= rmStartIntervalMS
+          || rmNeverStart) {
+        throw new YarnException("Faking RM start failure as start " +
+            "delay timer has not expired.");
+      } else {
+        return resourceTracker;
+      }
+    }
+  }
+
   private class MyNodeManager extends NodeManager {
     
     private MyNodeStatusUpdater3 nodeStatusUpdater;
@@ -580,6 +610,73 @@ public class TestNodeStatusUpdater {
         + "Recieved SHUTDOWN signal from Resourcemanager ,Registration of NodeManager failed");
   }
 
+  @Test (timeout = 15000)
+  public void testNMConnectionToRM() {
+    final long delta = 1500;
+    final long connectionWaitSecs = 5;
+    final long connectionRetryIntervalSecs = 1;
+    //Waiting for rmStartIntervalMS, RM will be started
+    final long rmStartIntervalMS = 2*1000;
+    YarnConfiguration conf = createNMConfig();
+    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_WAIT_SECS,
+        connectionWaitSecs);
+    conf.setLong(YarnConfiguration
+        .RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS,
+        connectionRetryIntervalSecs);
+
+    //Test NM try to connect to RM Several times, but finally fail
+    nm = new NodeManager() {
+      @Override
+      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+        NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater4(
+            context, dispatcher, healthChecker, metrics,
+            rmStartIntervalMS, true);
+        return nodeStatusUpdater;
+      }
+    };
+    nm.init(conf);
+    long waitStartTime = System.currentTimeMillis();
+    try {
+      nm.start();
+      Assert.fail("NM should have failed to start due to RM connect failure");
+    } catch(Exception e) {
+      Assert.assertTrue("NM should have tried re-connecting to RM during " +
+          "period of at least " + connectionWaitSecs + " seconds, but " +
+          "stopped retrying within " + (connectionWaitSecs + delta/1000) +
+          " seconds", (System.currentTimeMillis() - waitStartTime
+              >= connectionWaitSecs*1000) && (System.currentTimeMillis()
+              - waitStartTime < (connectionWaitSecs*1000+delta)));
+    }
+
+    //Test NM connect to RM, fail at first several attempts,
+    //but finally success.
+    nm = new NodeManager() {
+      @Override
+      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+        NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater4(
+            context, dispatcher, healthChecker, metrics, rmStartIntervalMS,
+            false);
+        return nodeStatusUpdater;
+      }
+    };
+
+    nm.init(conf);
+    waitStartTime = System.currentTimeMillis();
+    try {
+      nm.start();
+    } catch (Exception ex){
+      Assert.fail("NM should have started successfully " +
+          "after connecting to RM.");
+    }
+    Assert.assertTrue("NM should have connected to RM within " + delta/1000
+        +" seconds of RM starting up.",
+        (System.currentTimeMillis() - waitStartTime >= rmStartIntervalMS)
+        && (System.currentTimeMillis() - waitStartTime
+        < (rmStartIntervalMS+delta)));
+  }
+
   /**
    * Verifies that if for some reason NM fails to start ContainerManager RPC
    * server, RM is oblivious to NM's presence. The behaviour is like this

+ 110 - 27
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/test/java/org/apache/hadoop/yarn/server/webproxy/amfilter/TestAmFilter.java

@@ -19,41 +19,39 @@
 package org.apache.hadoop.yarn.server.webproxy.amfilter;
 
 import java.io.IOException;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
 import java.util.concurrent.atomic.AtomicBoolean;
 
-import javax.servlet.Filter;
-import javax.servlet.FilterChain;
-import javax.servlet.FilterConfig;
-import javax.servlet.ServletContext;
-import javax.servlet.ServletException;
-import javax.servlet.ServletRequest;
-import javax.servlet.ServletResponse;
+import javax.servlet.*;
+import javax.servlet.http.Cookie;
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 
-import junit.framework.Assert;
+import static junit.framework.Assert.*;
 
+import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServlet;
+import org.glassfish.grizzly.servlet.HttpServletResponseImpl;
 import org.junit.Test;
 import org.mockito.Mockito;
 
+/**
+ * Test AmIpFilter. Requests from hosts that are not declared should be
+ * redirected to the proxy; other requests can be filtered with (or without)
+ * a user name.
+ */
+public class TestAmFilter {
 
-public class TestAmFilter  {
-
-  private String proxyHost = "bogushost.com";
+  private String proxyHost = "localhost";
   private String proxyUri = "http://bogus";
+  private String doFilterRequest;
+  private AmIpServletRequestWrapper servletWrapper;
 
   private class TestAmIpFilter extends AmIpFilter {
 
     private Set<String> proxyAddresses = null;
 
     protected Set<String> getProxyAddresses() {
-      if(proxyAddresses == null) {
+      if (proxyAddresses == null) {
         proxyAddresses = new HashSet<String>();
       }
       proxyAddresses.add(proxyHost);
@@ -61,12 +59,10 @@ public class TestAmFilter  {
     }
   }
 
-
   private static class DummyFilterConfig implements FilterConfig {
     final Map<String, String> map;
 
-
-    DummyFilterConfig(Map<String,String> map) {
+    DummyFilterConfig(Map<String, String> map) {
       this.map = map;
     }
 
@@ -74,22 +70,24 @@ public class TestAmFilter  {
     public String getFilterName() {
       return "dummy";
     }
+
     @Override
     public String getInitParameter(String arg0) {
       return map.get(arg0);
     }
+
     @Override
     public Enumeration<String> getInitParameterNames() {
       return Collections.enumeration(map.keySet());
     }
+
     @Override
     public ServletContext getServletContext() {
       return null;
     }
   }
 
-
-  @Test
+  @Test(timeout = 5000)
   public void filterNullCookies() throws Exception {
     HttpServletRequest request = Mockito.mock(HttpServletRequest.class);
 
@@ -97,13 +95,12 @@ public class TestAmFilter  {
     Mockito.when(request.getRemoteAddr()).thenReturn(proxyHost);
 
     HttpServletResponse response = Mockito.mock(HttpServletResponse.class);
-
     final AtomicBoolean invoked = new AtomicBoolean();
 
     FilterChain chain = new FilterChain() {
       @Override
-      public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse)
-        throws IOException, ServletException {
+      public void doFilter(ServletRequest servletRequest,
+          ServletResponse servletResponse) throws IOException, ServletException {
         invoked.set(true);
       }
     };
@@ -115,7 +112,93 @@ public class TestAmFilter  {
     Filter filter = new TestAmIpFilter();
     filter.init(conf);
     filter.doFilter(request, response, chain);
-    Assert.assertTrue(invoked.get());
+    assertTrue(invoked.get());
     filter.destroy();
   }
+
+  /**
+   * Test AmIpFilter
+   */
+  @Test(timeout = 1000)
+  public void testFilter() throws Exception {
+    Map<String, String> params = new HashMap<String, String>();
+    params.put(AmIpFilter.PROXY_HOST, proxyHost);
+    params.put(AmIpFilter.PROXY_URI_BASE, proxyUri);
+    FilterConfig config = new DummyFilterConfig(params);
+
+    // dummy filter
+    FilterChain chain = new FilterChain() {
+      @Override
+      public void doFilter(ServletRequest servletRequest,
+          ServletResponse servletResponse) throws IOException, ServletException {
+        doFilterRequest = servletRequest.getClass().getName();
+        if (servletRequest instanceof AmIpServletRequestWrapper) {
+          servletWrapper = (AmIpServletRequestWrapper) servletRequest;
+
+        }
+      }
+    };
+    AmIpFilter testFilter = new AmIpFilter();
+    testFilter.init(config);
+
+    HttpServletResponseForTest response = new HttpServletResponseForTest();
+    // The request must implement HttpServletRequest
+
+    ServletRequest failRequest = Mockito.mock(ServletRequest.class);
+    try {
+      testFilter.doFilter(failRequest, response, chain);
+      fail();
+    } catch (ServletException e) {
+      assertEquals("This filter only works for HTTP/HTTPS", e.getMessage());
+    }
+
+    // request with HttpServletRequest
+    HttpServletRequest request = Mockito.mock(HttpServletRequest.class);
+    Mockito.when(request.getRemoteAddr()).thenReturn("redirect");
+    Mockito.when(request.getRequestURI()).thenReturn("/redirect");
+    testFilter.doFilter(request, response, chain);
+    // address "redirect" is not in host list
+    assertEquals("http://bogus/redirect", response.getRedirect());
+    // "127.0.0.1" contains in host list. Without cookie
+    Mockito.when(request.getRemoteAddr()).thenReturn("127.0.0.1");
+    testFilter.doFilter(request, response, chain);
+
+    assertTrue(doFilterRequest
+        .contains("javax.servlet.http.HttpServletRequest"));
+    // cookie added
+    Cookie[] cookies = new Cookie[1];
+    cookies[0] = new Cookie(WebAppProxyServlet.PROXY_USER_COOKIE_NAME, "user");
+
+    Mockito.when(request.getCookies()).thenReturn(cookies);
+    testFilter.doFilter(request, response, chain);
+
+    assertEquals(
+        "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpServletRequestWrapper",
+        doFilterRequest);
+    // request contains principal from cookie
+    assertEquals("user", servletWrapper.getUserPrincipal().getName());
+    assertEquals("user", servletWrapper.getRemoteUser());
+    assertFalse(servletWrapper.isUserInRole(""));
+
+  }
+
+  private class HttpServletResponseForTest extends HttpServletResponseImpl {
+    String redirectLocation = "";
+
+    public String getRedirect() {
+      return redirectLocation;
+    }
+
+    @Override
+    public void sendRedirect(String location) throws IOException {
+      redirectLocation = location;
+    }
+
+    @Override
+    public String encodeRedirectURL(String url) {
+      return url;
+    }
+
+  }
+
 }