Merging r1566359 through r1568420 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-5535@1568437 13f79535-47bb-0310-9956-ffa450edef68
Jing Zhao, 11 years ago
parent commit ba4b10354c
100 changed files with 10733 additions and 2246 deletions
  1. 29 3
      hadoop-common-project/hadoop-common/CHANGES.txt
  2. 2950 0
      hadoop-common-project/hadoop-common/src/main/docs/releasenotes.html
  3. 22 6
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
  4. 6 0
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/s3/S3FileSystem.java
  5. 6 0
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/s3native/NativeS3FileSystem.java
  6. 6 2
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/LossyRetryInvocationHandler.java
  7. 1 1
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java
  8. 4 2
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java
  9. 7 4
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/unix/DomainSocketWatcher.java
  10. 15 0
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Groups.java
  11. 1 1
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/LdapGroupsMapping.java
  12. 4 2
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/authorize/ServiceAuthorizationManager.java
  13. 53 0
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Waitable.java
  14. 1 1
      hadoop-common-project/hadoop-common/src/main/java/overview.html
  15. 6 1
      hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java
  16. 7 2
      hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java
  17. 4 3
      hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/unix/TestDomainSocketWatcher.java
  18. 4 2
      hadoop-hdfs-project/hadoop-hdfs-nfs/src/main/java/org/apache/hadoop/hdfs/nfs/nfs3/RpcProgramNfs3.java
  19. 144 28
      hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
  20. 3 0
      hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml
  21. 1 0
      hadoop-hdfs-project/hadoop-hdfs/pom.xml
  22. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs
  23. 1 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReader.java
  24. 676 164
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderFactory.java
  25. 15 70
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderLocal.java
  26. 11 11
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderLocalLegacy.java
  27. 204 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/ClientContext.java
  28. 52 74
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
  29. 10 6
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
  30. 56 227
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java
  31. 76 39
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DomainSocketFactory.java
  32. 75 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/ExtendedBlockId.java
  33. 0 287
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/FileInputStreamCache.java
  34. 3 26
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/PeerCache.java
  35. 1 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/RemoteBlockReader.java
  36. 1 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/RemoteBlockReader2.java
  37. 37 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/RemotePeerFactory.java
  38. 19 94
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java
  39. 0 482
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmapManager.java
  40. 881 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ShortCircuitCache.java
  41. 268 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ShortCircuitReplica.java
  42. 64 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ShortCircuitReplicaInfo.java
  43. 2 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
  44. 3 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
  45. 85 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java
  46. 5 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
  47. 96 89
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/JspHelper.java
  48. 6 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockMetadataHeader.java
  49. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
  50. 32 72
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java
  51. 129 18
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java
  52. 2 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java
  53. 6 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSClusterStats.java
  54. 2 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
  55. 5 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageCompression.java
  56. 68 7
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java
  57. 426 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatPBINode.java
  58. 583 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatProtobuf.java
  59. 93 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageUtil.java
  60. 28 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
  61. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeDirectory.java
  62. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java
  63. 5 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java
  64. 27 38
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
  65. 43 26
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java
  66. 4 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceContext.java
  67. 5 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryWithSnapshotFeature.java
  68. 506 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java
  69. 15 11
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotFSImageFormat.java
  70. 17 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotManager.java
  71. 19 5
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/web/resources/NamenodeWebHdfsMethods.java
  72. 160 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/FileDistributionCalculator.java
  73. 250 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/LsrPBImage.java
  74. 178 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/OfflineImageViewerPB.java
  75. 433 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/PBImageXmlWriter.java
  76. 284 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/proto/fsimage.proto
  77. 59 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
  78. 2 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.html
  79. 25 10
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.js
  80. 2 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/explorer.html
  81. 21 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/explorer.js
  82. 123 101
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestEnhancedByteBufferAccess.java
  83. 29 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestGlobPaths.java
  84. 60 13
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/BlockReaderTestUtil.java
  85. 32 5
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java
  86. 285 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockReaderFactory.java
  87. 9 4
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockReaderLocal.java
  88. 17 52
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestConnCache.java
  89. 22 20
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferKeepalive.java
  90. 2 1
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDisableConnCache.java
  91. 0 126
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileInputStreamCache.java
  92. 1 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileStatus.java
  93. 346 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestShortCircuitCache.java
  94. 16 21
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestShortCircuitLocalRead.java
  95. 64 31
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFS.java
  96. 161 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicyConsiderLoad.java
  97. 38 14
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java
  98. 36 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeduplicationMap.java
  99. 138 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSImage.java
  100. 0 5
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java

+ 29 - 3
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -312,6 +312,11 @@ Release 2.4.0 - UNRELEASED
     HADOOP-10295. Allow distcp to automatically identify the checksum type of 
     source files and use it for the target. (jing9 and Laurent Goujon)
 
+    HADOOP-10333. Fix grammatical error in overview.html document.
+    (René Nyffenegger via suresh)
+
+    HADOOP-10343. Change info to debug log in LossyRetryInvocationHandler. (arpit)
+
   OPTIMIZATIONS
 
   BUG FIXES
@@ -328,15 +333,36 @@ Release 2.4.0 - UNRELEASED
     HADOOP-10330. TestFrameDecoder fails if it cannot bind port 12345.
     (Arpit Agarwal)
 
-Release 2.3.0 - UNRELEASED
+    HADOOP-10326. M/R jobs can not access S3 if Kerberos is enabled. (bc Wong
+    via atm)
+
+    HADOOP-10338. Cannot get the FileStatus of the root inode from the new
+    Globber (cmccabe)
+
+    HADOOP-10249. LdapGroupsMapping should trim ldap password read from file.
+    (Dilli Armugam via suresh)
+
+Release 2.3.1 - UNRELEASED
 
   INCOMPATIBLE CHANGES
 
-    HADOOP-8545. Filesystem Implementation for OpenStack Swift
-    (Dmitry Mezhensky, David Dobbins, Stevel via stevel)
+  NEW FEATURES
+
+  IMPROVEMENTS
+
+  OPTIMIZATIONS
+
+  BUG FIXES 
+
+Release 2.3.0 - 2014-02-18
+
+  INCOMPATIBLE CHANGES
 
   NEW FEATURES
 
+    HADOOP-8545. Filesystem Implementation for OpenStack Swift
+    (Dmitry Mezhensky, David Dobbins, Stevel via stevel)
+
   IMPROVEMENTS
 
     HADOOP-10046. Print a log message when SSL is enabled.
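
For context on the HADOOP-10338 entry above (root-path globbing in the new Globber), the following is a minimal, hypothetical Java sketch, not part of this commit: the class name and configuration are illustrative, and it simply shows the kind of call whose root-inode result the fix restores.

{code}
// Hypothetical illustration (not part of this commit) of the HADOOP-10338 case:
// globbing the filesystem root, which the new Globber previously failed to resolve.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RootGlobExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // With the fix, globStatus("/") returns the FileStatus of the root inode
    // rather than coming back empty.
    FileStatus[] statuses = fs.globStatus(new Path("/"));
    for (FileStatus status : statuses) {
      System.out.println(status.getPath() + " isDirectory=" + status.isDirectory());
    }
  }
}
{code}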

+ 2950 - 0
hadoop-common-project/hadoop-common/src/main/docs/releasenotes.html

@@ -1,3 +1,2953 @@
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Hadoop  2.3.0 Release Notes</title>
+<STYLE type="text/css">
+	H1 {font-family: sans-serif}
+	H2 {font-family: sans-serif; margin-left: 7mm}
+	TABLE {margin-left: 7mm}
+</STYLE>
+</head>
+<body>
+<h1>Hadoop  2.3.0 Release Notes</h1>
+These release notes include new developer and user-facing incompatibilities, features, and major improvements. 
+<a name="changes"/>
+<h2>Changes since Hadoop 2.2.0</h2>
+<ul>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1642">YARN-1642</a>.
+     Blocker sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>RMDTRenewer#getRMClient should use ClientRMProxy</b><br>
+     <blockquote>RMDTRenewer#getRMClient gets a proxy to the RM in the conf directly instead of going through ClientRMProxy. 
+
+{code}
+      final YarnRPC rpc = YarnRPC.create(conf);
+      return (ApplicationClientProtocol)rpc.getProxy(ApplicationClientProtocol.class, addr, conf);
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1630">YARN-1630</a>.
+     Major bug reported by Aditya Acharya and fixed by Aditya Acharya (client)<br>
+     <b>Introduce timeout for async polling operations in YarnClientImpl</b><br>
+     <blockquote>I ran an MR2 application that would have been long running, and killed it programmatically using a YarnClient. The app was killed, but the client hung forever. The message that I saw, which spammed the logs, was "Watiting for application application_1389036507624_0018 to be killed."
+
+The RM log indicated that the app had indeed transitioned from RUNNING to KILLED, but for some reason future responses to the RPC to kill the application did not indicate that the app had been terminated.
+
+I tracked this down to YarnClientImpl.java, and though I was unable to reproduce the bug, I wrote a patch to introduce a bound on the number of times that YarnClientImpl retries the RPC before giving up.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1629">YARN-1629</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>IndexOutOfBoundsException in Fair Scheduler MaxRunningAppsEnforcer</b><br>
+     <blockquote>This can occur when the second-to-last app in a queue's pending app list is made runnable.  The app is pulled out from under the iterator. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1628">YARN-1628</a>.
+     Major bug reported by Mit Desai and fixed by Vinod Kumar Vavilapalli <br>
+     <b>TestContainerManagerSecurity fails on trunk</b><br>
+     <blockquote>The Test fails with the following error
+
+{noformat}
+java.lang.IllegalArgumentException: java.net.UnknownHostException: InvalidHost
+	at org.apache.hadoop.security.SecurityUtil.buildTokenService(SecurityUtil.java:377)
+	at org.apache.hadoop.yarn.server.security.BaseNMTokenSecretManager.newInstance(BaseNMTokenSecretManager.java:145)
+	at org.apache.hadoop.yarn.server.security.BaseNMTokenSecretManager.createNMToken(BaseNMTokenSecretManager.java:136)
+	at org.apache.hadoop.yarn.server.TestContainerManagerSecurity.testNMTokens(TestContainerManagerSecurity.java:253)
+	at org.apache.hadoop.yarn.server.TestContainerManagerSecurity.testContainerManager(TestContainerManagerSecurity.java:144)
+{noformat}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1624">YARN-1624</a>.
+     Major bug reported by Aditya Acharya and fixed by Aditya Acharya (scheduler)<br>
+     <b>QueuePlacementPolicy format is not easily readable via a JAXB parser</b><br>
+     <blockquote>The current format for specifying queue placement rules in the fair scheduler allocations file does not lend itself to easy parsing via a JAXB parser. In particular, relying on the tag name to encode information about which rule to use makes it very difficult for an xsd-based JAXB parser to preserve the order of the rules, which is essential.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1623">YARN-1623</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>Include queue name in RegisterApplicationMasterResponse</b><br>
+     <blockquote>This provides the YARN change necessary to support MAPREDUCE-5732.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1618">YARN-1618</a>.
+     Blocker sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Fix invalid RMApp transition from NEW to FINAL_SAVING</b><br>
+     <blockquote>YARN-891 augments the RMStateStore to store information on completed applications. In the process, it adds transitions from NEW to FINAL_SAVING. This leads to the RM trying to update entries in the state-store that do not exist. On ZKRMStateStore, this leads to the RM crashing. 
+
+Previous description:
+ZKRMStateStore fails to handle updates to znodes that don't exist. For instance, this can happen when an app transitions from NEW to FINAL_SAVING. In these cases, the store should create the missing znode and handle the update.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1616">YARN-1616</a>.
+     Trivial improvement reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>RMFatalEventDispatcher should log the cause of the event</b><br>
+     <blockquote>RMFatalEventDispatcher#handle() logs the receipt of an event and its type, but leaves out the cause. The cause captures why the event was raised and would help debugging issues. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1608">YARN-1608</a>.
+     Trivial bug reported by Karthik Kambatla and fixed by Karthik Kambatla (nodemanager)<br>
+     <b>LinuxContainerExecutor has a few DEBUG messages at INFO level</b><br>
+     <blockquote>LCE has a few INFO level log messages meant to be at debug level. In fact, they are logged both at INFO and DEBUG. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1607">YARN-1607</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>TestRM expects the capacity scheduler</b><br>
+     <blockquote>We should either explicitly set the Capacity Scheduler or make it scheduler-agnostic</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1603">YARN-1603</a>.
+     Trivial bug reported by Zhijie Shen and fixed by Zhijie Shen <br>
+     <b>Remove two *.orig files which were unexpectedly committed</b><br>
+     <blockquote>FairScheduler.java.orig and TestFifoScheduler.java.orig</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1601">YARN-1601</a>.
+     Major bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur <br>
+     <b>3rd party JARs are missing from hadoop-dist output</b><br>
+     <blockquote>With the build changes of YARN-888 we are leaving out all 3rd party JArs used directly by YARN under /share/hadoop/yarn/lib/.
+
+We did not notice this when running minicluster because they all happen to be in the classpath from hadoop-common and hadoop-yarn.
+
+As 3d party JARs are not 'public' interfaces we cannot rely on them being provided to yarn by common and hdfs. (ie if common and hdfs stop using a 3rd party dependency that yarn uses this would break yarn if yarn does not pull that dependency explicitly).
+
+Also, this will break bigtop hadoop build when they move to use branch-2 as they expect to find jars in /share/hadoop/yarn/lib/</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1600">YARN-1600</a>.
+     Blocker bug reported by Jason Lowe and fixed by Haohui Mai (resourcemanager)<br>
+     <b>RM does not startup when security is enabled without spnego configured</b><br>
+     <blockquote>We have a custom auth filter in front of our various UI pages that handles user authentication.  However currently the RM assumes that if security is enabled then the user must have configured spnego as well for the RM web pages which is not true in our case.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1598">YARN-1598</a>.
+     Critical sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (client , resourcemanager)<br>
+     <b>HA-related rmadmin commands don't work on a secure cluster</b><br>
+     <blockquote>The HA-related commands like -getServiceState -checkHealth etc. don't work in a secure cluster.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1579">YARN-1579</a>.
+     Trivial sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>ActiveRMInfoProto fields should be optional</b><br>
+     <blockquote>Per discussion on YARN-1568, ActiveRMInfoProto should have optional fields instead of required fields. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1575">YARN-1575</a>.
+     Critical sub-task reported by Jason Lowe and fixed by Jason Lowe (nodemanager)<br>
+     <b>Public localizer crashes with "Localized unkown resource"</b><br>
+     <blockquote>The public localizer can crash with the error:
+
+{noformat}
+2014-01-08 14:11:43,212 [Thread-467] ERROR org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Localized unkonwn resource to java.util.concurrent.FutureTask@852e26
+2014-01-08 14:11:43,212 [Thread-467] INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Public cache exiting
+{noformat}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1574">YARN-1574</a>.
+     Blocker sub-task reported by Xuan Gong and fixed by Xuan Gong <br>
+     <b>RMDispatcher should be reset on transition to standby</b><br>
+     <blockquote>Currently, we move rmDispatcher out of ActiveService. But we still register the Event dispatcher, such as schedulerDispatcher, RMAppEventDispatcher when we initiate the ActiveService.
+
+Almost every time when we transit RM from Active to Standby,  we need to initiate the ActiveService. That means we will register the same event Dispatcher which will cause the same event will be handled several times.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1573">YARN-1573</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>ZK store should use a private password for root-node-acls</b><br>
+     <blockquote>Currently, when HA is enabled, ZK store uses cluster-timestamp as the password for root node ACLs to give the Active RM exclusive access to the store. A more private value like a random number might be better. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1568">YARN-1568</a>.
+     Trivial task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Rename clusterid to clusterId in ActiveRMInfoProto </b><br>
+     <blockquote>YARN-1029 introduces ActiveRMInfoProto - just realized it defines a field clusterid, which is inconsistent with other fields. Better to fix it immediately than leave the inconsistency. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1567">YARN-1567</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>In Fair Scheduler, allow empty queues to change between leaf and parent on allocation file reload</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1560">YARN-1560</a>.
+     Major test reported by Ted Yu and fixed by Ted Yu <br>
+     <b>TestYarnClient#testAMMRTokens fails with null AMRM token</b><br>
+     <blockquote>The following can be reproduced locally:
+{code}
+testAMMRTokens(org.apache.hadoop.yarn.client.api.impl.TestYarnClient)  Time elapsed: 3.341 sec  &lt;&lt;&lt; FAILURE!
+junit.framework.AssertionFailedError: null
+  at junit.framework.Assert.fail(Assert.java:48)
+  at junit.framework.Assert.assertTrue(Assert.java:20)
+  at junit.framework.Assert.assertNotNull(Assert.java:218)
+  at junit.framework.Assert.assertNotNull(Assert.java:211)
+  at org.apache.hadoop.yarn.client.api.impl.TestYarnClient.testAMMRTokens(TestYarnClient.java:382)
+{code}
+This test didn't appear in https://builds.apache.org/job/Hadoop-Yarn-trunk/442/consoleFull</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1559">YARN-1559</a>.
+     Blocker sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Race between ServerRMProxy and ClientRMProxy setting RMProxy#INSTANCE</b><br>
+     <blockquote>RMProxy#INSTANCE is a non-final static field and both ServerRMProxy and ClientRMProxy set it. This leads to races as witnessed on - YARN-1482.
+
+Sample trace:
+{noformat}
+java.lang.IllegalArgumentException: RM does not support this client protocol
+        at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
+        at org.apache.hadoop.yarn.client.ClientRMProxy.checkAllowedProtocols(ClientRMProxy.java:119)
+        at org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider.init(ConfiguredRMFailoverProxyProvider.java:58)
+        at org.apache.hadoop.yarn.client.RMProxy.createRMFailoverProxyProvider(RMProxy.java:158)
+        at org.apache.hadoop.yarn.client.RMProxy.createRMProxy(RMProxy.java:88)
+        at org.apache.hadoop.yarn.server.api.ServerRMProxy.createRMProxy(ServerRMProxy.java:56)
+{noformat}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1549">YARN-1549</a>.
+     Major test reported by Ted Yu and fixed by haosdent <br>
+     <b>TestUnmanagedAMLauncher#testDSShell fails in trunk</b><br>
+     <blockquote>The following error is reproducible:
+{code}
+testDSShell(org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher)  Time elapsed: 14.911 sec  &lt;&lt;&lt; ERROR!
+java.lang.RuntimeException: Failed to receive final expected state in ApplicationReport, CurrentState=RUNNING, ExpectedStates=FINISHED,FAILED,KILLED
+	at org.apache.hadoop.yarn.applications.unmanagedamlauncher.UnmanagedAMLauncher.monitorApplication(UnmanagedAMLauncher.java:447)
+	at org.apache.hadoop.yarn.applications.unmanagedamlauncher.UnmanagedAMLauncher.run(UnmanagedAMLauncher.java:352)
+	at org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher.testDSShell(TestUnmanagedAMLauncher.java:147)
+{code}
+See https://builds.apache.org/job/Hadoop-Yarn-trunk/435</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1541">YARN-1541</a>.
+     Major bug reported by Jian He and fixed by Jian He <br>
+     <b>Invalidate AM Host/Port when app attempt is done so that in the mean-while client doesn&#8217;t get wrong information.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1527">YARN-1527</a>.
+     Trivial bug reported by Jian He and fixed by Akira AJISAKA <br>
+     <b>yarn rmadmin command prints wrong usage info:</b><br>
+     <blockquote>The usage should be: yarn rmadmin, instead of java RMAdmin, and the -refreshQueues should be in the second line.
+{code} Usage: java RMAdmin   -refreshQueues 
+   -refreshNodes 
+   -refreshSuperUserGroupsConfiguration 
+   -refreshUserToGroupsMappings 
+   -refreshAdminAcls 
+   -refreshServiceAcl 
+   -getGroups [username]
+   -help [cmd]
+   -transitionToActive &lt;serviceId&gt;
+   -transitionToStandby &lt;serviceId&gt;
+   -failover [--forcefence] [--forceactive] &lt;serviceId&gt; &lt;serviceId&gt;
+   -getServiceState &lt;serviceId&gt;
+   -checkHealth &lt;serviceId&gt;
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1523">YARN-1523</a>.
+     Major sub-task reported by Bikas Saha and fixed by Karthik Kambatla <br>
+     <b>Use StandbyException instead of RMNotYetReadyException</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1522">YARN-1522</a>.
+     Major bug reported by Liyin Liang and fixed by Liyin Liang <br>
+     <b>TestApplicationCleanup.testAppCleanup occasionally fails</b><br>
+     <blockquote>TestApplicationCleanup is occasionally failing with the error:
+{code}
+-------------------------------------------------------------------------------
+Test set: org.apache.hadoop.yarn.server.resourcemanager.TestApplicationCleanup
+-------------------------------------------------------------------------------
+Tests run: 1, Failures: 1, Errors: 0, Skipped: 0, Time elapsed: 6.215 sec &lt;&lt;&lt; FAILURE! - in org.apache.hadoop.yarn.server.resourcemanager.TestApplicationCleanup
+testAppCleanup(org.apache.hadoop.yarn.server.resourcemanager.TestApplicationCleanup) Time elapsed: 5.555 sec &lt;&lt;&lt; FAILURE!
+junit.framework.AssertionFailedError: expected:&lt;1&gt; but was:&lt;0&gt;
+at org.apache.hadoop.yarn.server.resourcemanager.TestApplicationCleanup.testAppCleanup(TestApplicationCleanup.java:119)
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1505">YARN-1505</a>.
+     Blocker bug reported by Xuan Gong and fixed by Xuan Gong <br>
+     <b>WebAppProxyServer should not set localhost as YarnConfiguration.PROXY_ADDRESS by itself</b><br>
+     <blockquote>At WebAppProxyServer::startServer(), it will set up  YarnConfiguration.PROXY_ADDRESS to localhost:9099 by itself. So, no matter what is the value we set YarnConfiguration.PROXY_ADDRESS in configuration, the proxyserver will bind to localhost:9099</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1491">YARN-1491</a>.
+     Trivial bug reported by Jonathan Eagles and fixed by Chen He <br>
+     <b>Upgrade JUnit3 TestCase to JUnit 4</b><br>
+     <blockquote>There are still four references to test classes that extend from junit.framework.TestCase
+
+hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestYarnVersionInfo.java
+hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestWindowsResourceCalculatorPlugin.java
+hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestLinuxResourceCalculatorPlugin.java
+hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestWindowsBasedProcessTree.java
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1485">YARN-1485</a>.
+     Major sub-task reported by Xuan Gong and fixed by Xuan Gong <br>
+     <b>Enabling HA should verify the RM service addresses configurations have been set for every RM Ids defined in RM_HA_IDs</b><br>
+     <blockquote>After YARN-1325, the YarnConfiguration.RM_HA_IDS will contain multiple RM_Ids. We need to verify that the RM service addresses configurations have been set for all of RM_Ids.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1482">YARN-1482</a>.
+     Major sub-task reported by Vinod Kumar Vavilapalli and fixed by Xuan Gong <br>
+     <b>WebApplicationProxy should be always-on w.r.t HA even if it is embedded in the RM</b><br>
+     <blockquote>This way, even if an RM goes to standby mode, we can affect a redirect to the active. And more importantly, users will not suddenly see all their links stop working.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1481">YARN-1481</a>.
+     Major sub-task reported by Vinod Kumar Vavilapalli and fixed by Vinod Kumar Vavilapalli <br>
+     <b>Move internal services logic from AdminService to ResourceManager</b><br>
+     <blockquote>This is something I found while reviewing YARN-1318, but didn't halt that patch as many cycles went there already. Some top level issues
+ - Not easy to follow RM's service life cycle
+    -- RM adds only AdminService as its service directly.
+    -- Other services are added to RM when AdminService's init calls RM.activeServices.init()
+ - Overall, AdminService shouldn't encompass all of RM's HA state management. It was originally supposed to be the implementation of just the RPC server.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1463">YARN-1463</a>.
+     Major test reported by Ted Yu and fixed by Vinod Kumar Vavilapalli <br>
+     <b>Tests should avoid starting http-server where possible or creates spnego keytab/principals</b><br>
+     <blockquote>Here is stack trace:
+{code}
+testContainerManager[1](org.apache.hadoop.yarn.server.TestContainerManagerSecurity)  Time elapsed: 1.756 sec  &lt;&lt;&lt; ERROR!
+org.apache.hadoop.yarn.exceptions.YarnRuntimeException: java.io.IOException: ResourceManager failed to start. Final state is STOPPED
+  at org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:253)
+  at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
+  at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
+  at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
+  at org.apache.hadoop.yarn.server.TestContainerManagerSecurity.testContainerManager(TestContainerManagerSecurity.java:110)
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1454">YARN-1454</a>.
+     Critical bug reported by Jian He and fixed by Karthik Kambatla <br>
+     <b>TestRMRestart.testRMDelegationTokenRestoredOnRMRestart is failing intermittently </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1451">YARN-1451</a>.
+     Minor bug reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>TestResourceManager relies on the scheduler assigning multiple containers in a single node update</b><br>
+     <blockquote>TestResourceManager rely on the capacity scheduler.
+
+It relies on a scheduler that assigns multiple containers in a single heartbeat, which not all schedulers do by default.  It also relies on schedulers that don't consider CPU capacities.  It would be simple to change the test to use multiple heartbeats and increase the vcore capacities of the nodes in the test.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1450">YARN-1450</a>.
+     Major bug reported by Akira AJISAKA and fixed by Binglin Chang (applications/distributed-shell)<br>
+     <b>TestUnmanagedAMLauncher#testDSShell fails on trunk</b><br>
+     <blockquote>TestUnmanagedAMLauncher fails on trunk. The console output is
+{code}
+Running org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher
+Tests run: 2, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: 35.937 sec &lt;&lt;&lt; FAILURE! - in org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher
+testDSShell(org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher)  Time elapsed: 14.558 sec  &lt;&lt;&lt; ERROR!
+java.lang.RuntimeException: Failed to receive final expected state in ApplicationReport, CurrentState=ACCEPTED, ExpectedStates=FINISHED,FAILED,KILLED
+	at org.apache.hadoop.yarn.applications.unmanagedamlauncher.UnmanagedAMLauncher.monitorApplication(UnmanagedAMLauncher.java:447)
+	at org.apache.hadoop.yarn.applications.unmanagedamlauncher.UnmanagedAMLauncher.run(UnmanagedAMLauncher.java:352)
+	at org.apache.hadoop.yarn.applications.unmanagedamlauncher.TestUnmanagedAMLauncher.testDSShell(TestUnmanagedAMLauncher.java:145)
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1448">YARN-1448</a>.
+     Major sub-task reported by Wangda Tan and fixed by Wangda Tan (api , resourcemanager)<br>
+     <b>AM-RM protocol changes to support container resizing</b><br>
+     <blockquote>As described in YARN-1197, we need add API in RM to support
+1) Add increase request in AllocateRequest
+2) Can get successfully increased/decreased containers from RM in AllocateResponse</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1447">YARN-1447</a>.
+     Major sub-task reported by Wangda Tan and fixed by Wangda Tan (api)<br>
+     <b>Common PB type definitions for container resizing</b><br>
+     <blockquote>As described in YARN-1197, we need add some common PB types for container resource change, like ResourceChangeContext, etc. These types will be both used by RM/NM protocols</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1446">YARN-1446</a>.
+     Major sub-task reported by Jian He and fixed by Jian He (resourcemanager)<br>
+     <b>Change killing application to wait until state store is done</b><br>
+     <blockquote>When user kills an application, it should wait until the state store is done with saving the killed status of the application. Otherwise, if RM crashes in the middle between user killing the application and writing the status to the store, RM will relaunch this application after it restarts.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1435">YARN-1435</a>.
+     Major bug reported by Tassapol Athiapinya and fixed by Xuan Gong (applications/distributed-shell)<br>
+     <b>Distributed Shell should not run other commands except "sh", and run the custom script at the same time.</b><br>
+     <blockquote>Currently, if we want to run custom script at DS. We can do it like this :
+--shell_command sh --shell_script custom_script.sh
+But it may be better to separate running shell_command and shell_script</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1425">YARN-1425</a>.
+     Major bug reported by Omkar Vinit Joshi and fixed by Omkar Vinit Joshi <br>
+     <b>TestRMRestart fails because MockRM.waitForState(AttemptId) uses current attempt instead of the attempt passed as argument</b><br>
+     <blockquote>TestRMRestart is failing on trunk. Fixing it. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1423">YARN-1423</a>.
+     Major improvement reported by Sandy Ryza and fixed by Ted Malaska (scheduler)<br>
+     <b>Support queue placement by secondary group in the Fair Scheduler</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1419">YARN-1419</a>.
+     Minor bug reported by Jonathan Eagles and fixed by Jonathan Eagles (scheduler)<br>
+     <b>TestFifoScheduler.testAppAttemptMetrics fails intermittently under jdk7 </b><br>
+     <blockquote>QueueMetrics holds its data in a static variable causing metrics to bleed over from test to test. clearQueueMetrics is to be called for tests that need to measure metrics correctly for a single test. jdk7 comes into play since tests are run out of order, and in the case make the metrics unreliable.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1416">YARN-1416</a>.
+     Major bug reported by Omkar Vinit Joshi and fixed by Jian He <br>
+     <b>InvalidStateTransitions getting reported in multiple test cases even though they pass</b><br>
+     <blockquote>It might be worth checking why they are reporting this.
+Testcase : TestRMAppTransitions, TestRM
+there are large number of such errors.
+can't handle RMAppEventType.APP_UPDATE_SAVED at RMAppState.FAILED
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1411">YARN-1411</a>.
+     Critical sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla <br>
+     <b>HA config shouldn't affect NodeManager RPC addresses</b><br>
+     <blockquote>When HA is turned on, {{YarnConfiguration#getSoketAddress()}} fetches rpc-addresses corresponding to the specified rm-id. This should only be for RM rpc-addresses. Other confs, like NM rpc-addresses shouldn't be affected by this.
+
+Currently, the NM address settings in yarn-site.xml aren't reflected in the actual ports.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1409">YARN-1409</a>.
+     Major bug reported by Tsuyoshi OZAWA and fixed by Tsuyoshi OZAWA <br>
+     <b>NonAggregatingLogHandler can throw RejectedExecutionException</b><br>
+     <blockquote>This problem is caused by handling APPLICATION_FINISHED events after calling sched.shotdown() in NonAggregatingLongHandler#serviceStop(). org.apache.hadoop.mapred.TestJobCleanup can fail because of RejectedExecutionException by NonAggregatingLogHandler.
+
+{code}
+2013-11-13 10:53:06,970 FATAL [AsyncDispatcher event handler] event.AsyncDispatcher (AsyncDispatcher.java:dispatch(166)) - Error in dispatcher thread
+java.util.concurrent.RejectedExecutionException: Task java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask@d51df63 rejected from java.util.concurrent.ScheduledThreadPoolExecutor@7a20e369[Shutting down, pool size = 4, active threads = 0, queued tasks = 7, completed tasks = 0]
+        at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048)
+        at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
+        at java.util.concurrent.ScheduledThreadPoolExecutor.delayedExecute(ScheduledThreadPoolExecutor.java:325)
+        at java.util.concurrent.ScheduledThreadPoolExecutor.schedule(ScheduledThreadPoolExecutor.java:530)
+        at org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler.handle(NonAggregatingLogHandler.java:121)
+        at org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler.handle(NonAggregatingLogHandler.java:49)
+        at org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:159)
+        at org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:95)
+        at java.lang.Thread.run(Thread.java:724)
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1407">YARN-1407</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>RM Web UI and REST APIs should uniformly use YarnApplicationState</b><br>
+     <blockquote>RMAppState isn't a public facing enum like YarnApplicationState, so we shouldn't return values or list filters that come from it. However, some Blocks and AppInfo are still using RMAppState.
+
+It is not 100% clear to me whether or not fixing this would be a backwards-incompatible change.  The change would only reduce the set of possible strings that the API returns, so I think not.  We have also been changing the contents of RMAppState since 2.2.0, e.g. in YARN-891. It would still be good to fix this ASAP (i.e. for 2.2.1).</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1405">YARN-1405</a>.
+     Major sub-task reported by Yesha Vora and fixed by Jian He <br>
+     <b>RM hangs on shutdown if calling system.exit in serviceInit or serviceStart</b><br>
+     <blockquote>Enable yarn.resourcemanager.recovery.enabled=true and Pass a local path to yarn.resourcemanager.fs.state-store.uri. such as "file:///tmp/MYTMP"
+
+if the directory  /tmp/MYTMP is not readable or writable, RM should crash and should print "Permission denied Error"
+
+Currently, RM throws "java.io.FileNotFoundException: File file:/tmp/MYTMP/FSRMStateRoot/RMDTSecretManagerRoot does not exist" Error. RM returns Exiting status 1 but RM process does not shutdown. 
+
+Snapshot of Resource manager log:
+
+2013-09-27 18:31:36,621 INFO  security.NMTokenSecretManagerInRM (NMTokenSecretManagerInRM.java:rollMasterKey(97)) - Rolling master-key for nm-tokens
+2013-09-27 18:31:36,694 ERROR resourcemanager.ResourceManager (ResourceManager.java:serviceStart(640)) - Failed to load/recover state
+java.io.FileNotFoundException: File file:/tmp/MYTMP/FSRMStateRoot/RMDTSecretManagerRoot does not exist
+        at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:379)
+        at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1478)
+        at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1518)
+        at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:564)
+        at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.loadRMDTSecretManagerState(FileSystemRMStateStore.java:188)
+        at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.loadState(FileSystemRMStateStore.java:112)
+        at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:635)
+        at org.apache.hadoop.service.AbstractService.start(AbstractService.java:193)
+        at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:855)
+2013-09-27 18:31:36,697 INFO  util.ExitUtil (ExitUtil.java:terminate(124)) - Exiting with status 1</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1403">YARN-1403</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>Separate out configuration loading from QueueManager in the Fair Scheduler</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1401">YARN-1401</a>.
+     Major bug reported by Gera Shegalov and fixed by Gera Shegalov (nodemanager)<br>
+     <b>With zero sleep-delay-before-sigkill.ms, no signal is ever sent</b><br>
+     <blockquote>If you set in yarn-site.xml yarn.nodemanager.sleep-delay-before-sigkill.ms=0 then an unresponsive child JVM is never killed. In MRv1, TT used to immediately SIGKILL in this case. </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1400">YARN-1400</a>.
+     Trivial bug reported by Raja Aluri and fixed by Raja Aluri (resourcemanager)<br>
+     <b>yarn.cmd uses HADOOP_RESOURCEMANAGER_OPTS. Should be YARN_RESOURCEMANAGER_OPTS.</b><br>
+     <blockquote>yarn.cmd uses HADOOP_RESOURCEMANAGER_OPTS. Should be YARN_RESOURCEMANAGER_OPTS.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1395">YARN-1395</a>.
+     Major bug reported by Chris Nauroth and fixed by Chris Nauroth (applications/distributed-shell)<br>
+     <b>Distributed shell application master launched with debug flag can hang waiting for external ls process.</b><br>
+     <blockquote>Distributed shell launched with the debug flag will run {{ApplicationMaster#dumpOutDebugInfo}}.  This method launches an external process to run ls and print the contents of the current working directory.  We've seen that this can cause the application master to hang on {{Process#waitFor}}.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1392">YARN-1392</a>.
+     Major new feature reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>Allow sophisticated app-to-queue placement policies in the Fair Scheduler</b><br>
+     <blockquote>Currently the Fair Scheduler supports app-to-queue placement by username.  It would be beneficial to allow more sophisticated policies that rely on primary and secondary groups and fallbacks.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1388">YARN-1388</a>.
+     Trivial bug reported by Liyin Liang and fixed by Liyin Liang (resourcemanager)<br>
+     <b>Fair Scheduler page always displays blank fair share</b><br>
+     <blockquote>YARN-1044 fixed min/max/used resource display problem in the scheduler  page. But the "Fair Share" has the same problem and need to fix it.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1387">YARN-1387</a>.
+     Major improvement reported by Karthik Kambatla and fixed by Karthik Kambatla (api)<br>
+     <b>RMWebServices should use ClientRMService for filtering applications</b><br>
+     <blockquote>YARN's REST API allows filtering applications, this should be moved to ClientRMService to allow Java API also support the same functionality.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1386">YARN-1386</a>.
+     Critical bug reported by Jason Lowe and fixed by Jason Lowe (nodemanager)<br>
+     <b>NodeManager mistakenly loses resources and relocalizes them</b><br>
+     <blockquote>When a local resource that should already be present is requested again, the nodemanager checks to see if it still present.  However the method it uses to check for presence is via File.exists() as the user of the nodemanager process. If the resource was a private resource localized for another user, it will be localized to a location that is not accessible by the nodemanager user.  Therefore File.exists() returns false, the nodemanager mistakenly believes the resource is no longer available, and it proceeds to localize it over and over.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1381">YARN-1381</a>.
+     Minor bug reported by Ted Yu and fixed by Ted Yu <br>
+     <b>Same relaxLocality appears twice in exception message of AMRMClientImpl#checkLocalityRelaxationConflict() </b><br>
+     <blockquote>Here is related code:
+{code}
+            throw new InvalidContainerRequestException("Cannot submit a "
+                + "ContainerRequest asking for location " + location
+                + " with locality relaxation " + relaxLocality + " when it has "
+                + "already been requested with locality relaxation " + relaxLocality);
+{code}
+The last relaxLocality should be  reqs.values().iterator().next().remoteRequest.getRelaxLocality() </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1378">YARN-1378</a>.
+     Major sub-task reported by Jian He and fixed by Jian He (resourcemanager)<br>
+     <b>Implement a RMStateStore cleaner for deleting application/attempt info</b><br>
+     <blockquote>Now that we are storing the final state of application/attempt instead of removing application/attempt info on application/attempt completion(YARN-891), we need a separate RMStateStore cleaner for cleaning the application/attempt state.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1374">YARN-1374</a>.
+     Blocker bug reported by Devaraj K and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Resource Manager fails to start due to ConcurrentModificationException</b><br>
+     <blockquote>Resource Manager is failing to start with the below ConcurrentModificationException.
+
+{code:xml}
+2013-10-30 20:22:42,371 INFO org.apache.hadoop.util.HostsFileReader: Refreshing hosts (include/exclude) list
+2013-10-30 20:22:42,376 INFO org.apache.hadoop.service.AbstractService: Service ResourceManager failed in state INITED; cause: java.util.ConcurrentModificationException
+java.util.ConcurrentModificationException
+	at java.util.AbstractList$Itr.checkForComodification(AbstractList.java:372)
+	at java.util.AbstractList$Itr.next(AbstractList.java:343)
+	at java.util.Collections$UnmodifiableCollection$1.next(Collections.java:1010)
+	at org.apache.hadoop.service.CompositeService.serviceInit(CompositeService.java:107)
+	at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceInit(ResourceManager.java:187)
+	at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
+	at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:944)
+2013-10-30 20:22:42,378 INFO org.apache.hadoop.yarn.server.resourcemanager.RMHAProtocolService: Transitioning to standby
+2013-10-30 20:22:42,378 INFO org.apache.hadoop.yarn.server.resourcemanager.RMHAProtocolService: Transitioned to standby
+2013-10-30 20:22:42,378 FATAL org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Error starting ResourceManager
+java.util.ConcurrentModificationException
+	at java.util.AbstractList$Itr.checkForComodification(AbstractList.java:372)
+	at java.util.AbstractList$Itr.next(AbstractList.java:343)
+	at java.util.Collections$UnmodifiableCollection$1.next(Collections.java:1010)
+	at org.apache.hadoop.service.CompositeService.serviceInit(CompositeService.java:107)
+	at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceInit(ResourceManager.java:187)
+	at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
+	at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:944)
+2013-10-30 20:22:42,379 INFO org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: SHUTDOWN_MSG: 
+/************************************************************
+SHUTDOWN_MSG: Shutting down ResourceManager at HOST-10-18-40-24/10.18.40.24
+************************************************************/
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1358">YARN-1358</a>.
+     Minor test reported by Chuan Liu and fixed by Chuan Liu (client)<br>
+     <b>TestYarnCLI fails on Windows due to line endings</b><br>
+     <blockquote>The unit test fails on Windows due to incorrect line endings was used for comparing the output from command line output. Error messages are as follows.
+{noformat}
+junit.framework.ComparisonFailure: expected:&lt;...argument for options[]
+usage: application
+...&gt; but was:&lt;...argument for options[
+]
+usage: application
+...&gt;
+	at junit.framework.Assert.assertEquals(Assert.java:85)
+	at junit.framework.Assert.assertEquals(Assert.java:91)
+	at org.apache.hadoop.yarn.client.cli.TestYarnCLI.testMissingArguments(TestYarnCLI.java:878)
+{noformat}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1357">YARN-1357</a>.
+     Minor test reported by Chuan Liu and fixed by Chuan Liu (nodemanager)<br>
+     <b>TestContainerLaunch.testContainerEnvVariables fails on Windows</b><br>
+     <blockquote>This test fails on Windows due to incorrect use of batch script command. Error messages are as follows.
+{noformat}
+junit.framework.AssertionFailedError: expected:&lt;java.nio.HeapByteBuffer[pos=0 lim=19 cap=19]&gt; but was:&lt;java.nio.HeapByteBuffer[pos=0 lim=19 cap=19]&gt;
+	at junit.framework.Assert.fail(Assert.java:50)
+	at junit.framework.Assert.failNotEquals(Assert.java:287)
+	at junit.framework.Assert.assertEquals(Assert.java:67)
+	at junit.framework.Assert.assertEquals(Assert.java:74)
+	at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.TestContainerLaunch.testContainerEnvVariables(TestContainerLaunch.java:508)
+{noformat}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1351">YARN-1351</a>.
+     Trivial bug reported by Konstantin Weitz and fixed by Konstantin Weitz (resourcemanager)<br>
+     <b>Invalid string format in Fair Scheduler log warn message</b><br>
+     <blockquote>While trying to print a warning, two values of the wrong type (Resource instead of int) are passed into a String.format method call, leading to a runtime exception, in the file:
+
+_trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/QueueManager.java_.
+
+The warning was intended to be printed whenever the resources don't fit into each other, either because the number of virtual cores or the memory is too small. I changed the %d's into %s, this way the warning will contain both the cores and the memory.
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1349">YARN-1349</a>.
+     Major bug reported by Chris Nauroth and fixed by Chris Nauroth (client)<br>
+     <b>yarn.cmd does not support passthrough to any arbitrary class.</b><br>
+     <blockquote>The yarn shell script supports passing through to any arbitrary class if the first argument is not one of the pre-defined sub-commands.  The equivalent cmd script does not implement this and instead fails trying to do a labeled goto to the first argument.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1343">YARN-1343</a>.
+     Critical bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (resourcemanager)<br>
+     <b>NodeManagers additions/restarts are not reported as node updates in AllocateResponse responses to AMs</b><br>
+     <blockquote>If a NodeManager joins the cluster or gets restarted, running AMs never receive the node update indicating the Node is running.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1335">YARN-1335</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>Move duplicate code from FSSchedulerApp and FiCaSchedulerApp into SchedulerApplication</b><br>
+     <blockquote>FSSchedulerApp and FiCaSchedulerApp use duplicate code in a lot of places.  They both extend SchedulerApplication.  We can move a lot of this duplicate code into SchedulerApplication.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1333">YARN-1333</a>.
+     Major improvement reported by Sandy Ryza and fixed by Tsuyoshi OZAWA (scheduler)<br>
+     <b>Support blacklisting in the Fair Scheduler</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1332">YARN-1332</a>.
+     Minor improvement reported by Sandy Ryza and fixed by Sebastian Wong <br>
+     <b>In TestAMRMClient, replace assertTrue with assertEquals where possible</b><br>
+     <blockquote>TestAMRMClient uses a lot of "assertTrue(amClient.ask.size() == 0)" where "assertEquals(0, amClient.ask.size())" would make it easier to see why it's failing at a glance.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1331">YARN-1331</a>.
+     Trivial bug reported by Chris Nauroth and fixed by Chris Nauroth (client)<br>
+     <b>yarn.cmd exits with NoClassDefFoundError trying to run rmadmin or logs</b><br>
+     <blockquote>The yarn shell script was updated so that the rmadmin and logs sub-commands launch {{org.apache.hadoop.yarn.client.cli.RMAdminCLI}} and {{org.apache.hadoop.yarn.client.cli.LogsCLI}}.  The yarn.cmd script also needs to be updated so that the commands work on Windows.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1325">YARN-1325</a>.
+     Major sub-task reported by Tsuyoshi OZAWA and fixed by Xuan Gong (resourcemanager)<br>
+     <b>Enabling HA should check Configuration contains multiple RMs</b><br>
+     <blockquote>Currently, we can enable the RM HA configuration without multiple RM ids (YarnConfiguration.RM_HA_IDS).  This behaviour can lead to incorrect operation. The ResourceManager should verify that more than one RM id is specified in RM-HA-IDs.
+
+One idea is to support a "strict mode" that enforces this check via configuration (e.g. yarn.resourcemanager.ha.strict-mode.enabled).</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1323">YARN-1323</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla <br>
+     <b>Set HTTPS webapp address along with other RPC addresses in HAUtil</b><br>
+     <blockquote>YARN-1232 adds the ability to configure multiple RMs, but missed the HTTPS web app address. Need to add that in.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1321">YARN-1321</a>.
+     Blocker bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (client)<br>
+     <b>NMTokenCache is a singleton, prevents multiple AMs running in a single JVM to work correctly</b><br>
+     <blockquote>NMTokenCache is a singleton. Because of this, when running multiple AMs in a single JVM, NMTokens for the same node from different AMs step on each other, and starting containers fails due to mismatched tokens.
+
+The error observed on the client side is something like:
+
+{code}
+ERROR org.apache.hadoop.security.UserGroupInformation: PriviledgedActionException as:llama (auth:PROXY) via llama (auth:SIMPLE) cause:org.apache.hadoop.yarn.exceptions.YarnException: Unauthorized request to start container. 
+NMToken for application attempt : appattempt_1382038445650_0002_000001 was used for starting container with container token issued for application attempt : appattempt_1382038445650_0001_000001
+{code}
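+
+One way to avoid the clash, sketched below, is to give each in-process AM its own cache rather than the JVM-wide singleton; the per-instance NMTokenCache and the setNMTokenCache setters are assumed from the later 2.x client API.
+{code}
+import org.apache.hadoop.yarn.client.api.AMRMClient;
+import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
+import org.apache.hadoop.yarn.client.api.NMClient;
+import org.apache.hadoop.yarn.client.api.NMTokenCache;
+
+public class PerAmTokenCacheSketch {
+  public static void wireClients() {
+    // One NMTokenCache per AM instance, so tokens for the same node issued to
+    // different AMs in the same JVM no longer overwrite each other.
+    NMTokenCache perAmCache = new NMTokenCache();
+    AMRMClient&lt;ContainerRequest&gt; rmClient = AMRMClient.createAMRMClient();
+    rmClient.setNMTokenCache(perAmCache);
+    NMClient nmClient = NMClient.createNMClient();
+    nmClient.setNMTokenCache(perAmCache);
+  }
+}
+{code}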
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1320">YARN-1320</a>.
+     Major bug reported by Tassapol Athiapinya and fixed by Xuan Gong (applications/distributed-shell)<br>
+     <b>Custom log4j properties in Distributed shell does not work properly.</b><br>
+     <blockquote>Distributed shell cannot pick up custom log4j properties (specified with -log_properties). It always uses default log4j properties.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1318">YARN-1318</a>.
+     Blocker sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Promote AdminService to an Always-On service and merge in RMHAProtocolService</b><br>
+     <blockquote>Per discussion in YARN-1068, we want AdminService to handle HA-admin operations in addition to the regular non-HA admin operations. To facilitate this, we need to make AdminService an Always-On service.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1315">YARN-1315</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza (resourcemanager , scheduler)<br>
+     <b>TestQueueACLs should also test FairScheduler</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1314">YARN-1314</a>.
+     Major bug reported by Tassapol Athiapinya and fixed by Xuan Gong (applications/distributed-shell)<br>
+     <b>Cannot pass more than 1 argument to shell command</b><br>
+     <blockquote>Distributed shell cannot accept more than one parameter in the shell arguments.
+
+All of these commands are treated as a single parameter:
+
+/usr/bin/yarn  org.apache.hadoop.yarn.applications.distributedshell.Client -jar &lt;distrubuted shell jar&gt; -shell_command echo -shell_args "'"My   name"                "is  Teddy"'"
+/usr/bin/yarn  org.apache.hadoop.yarn.applications.distributedshell.Client -jar &lt;distrubuted shell jar&gt; -shell_command echo -shell_args "''My   name'                'is  Teddy''"
+/usr/bin/yarn  org.apache.hadoop.yarn.applications.distributedshell.Client -jar &lt;distrubuted shell jar&gt; -shell_command echo -shell_args "'My   name'                'is  Teddy'"</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1311">YARN-1311</a>.
+     Trivial sub-task reported by Vinod Kumar Vavilapalli and fixed by Vinod Kumar Vavilapalli <br>
+     <b>Fix app specific scheduler-events' names to be app-attempt based</b><br>
+     <blockquote>Today, APP_ADDED and APP_REMOVED are sent to the scheduler. They are misnomers as schedulers only deal with AppAttempts today. This JIRA is for fixing their names so that we can add App-level events in the near future, notably for work-preserving RM-restart.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1307">YARN-1307</a>.
+     Major sub-task reported by Tsuyoshi OZAWA and fixed by Tsuyoshi OZAWA (resourcemanager)<br>
+     <b>Rethink znode structure for RM HA</b><br>
+     <blockquote>Rethinking the znode structure for RM HA has been proposed in some JIRAs (YARN-659, YARN-1222). The motivation of this JIRA is quoted from Bikas' comment in YARN-1222:
+{quote}
+We should move to creating a node hierarchy for apps such that all znodes for an app are stored under an app znode instead of the app root znode. This will help in removeApplication and also in scaling better on ZK. The earlier code was written this way to ensure create/delete happens under a root znode for fencing. But given that we have moved to multi-operations globally, this isn't required anymore.
+{quote}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1306">YARN-1306</a>.
+     Major bug reported by Wei Yan and fixed by Wei Yan <br>
+     <b>Clean up hadoop-sls sample-conf according to YARN-1228</b><br>
+     <blockquote>Move the fair scheduler allocations configuration to fair-scheduler.xml, and move all scheduler settings to yarn-site.xml.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1305">YARN-1305</a>.
+     Major sub-task reported by Tsuyoshi OZAWA and fixed by Tsuyoshi OZAWA (resourcemanager)<br>
+     <b>RMHAProtocolService#serviceInit should handle HAUtil's IllegalArgumentException</b><br>
+     <blockquote>When yarn.resourcemanager.ha.enabled is true, RMHAProtocolService#serviceInit calls HAUtil.setAllRpcAddresses. If the configuration values are null, it just throws IllegalArgumentException.
+It's messy to analyse which keys are null, so we should handle the exception and log the names of the keys that are null.
+
+A current log dump is as follows:
+{code}
+2013-10-15 06:24:53,431 INFO org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: registered UNIX signal handlers for [TERM, HUP, INT]
+2013-10-15 06:24:54,203 INFO org.apache.hadoop.service.AbstractService: Service RMHAProtocolService failed in state INITED; cause: java.lang.IllegalArgumentException: Property value must not be null
+java.lang.IllegalArgumentException: Property value must not be null
+        at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
+        at org.apache.hadoop.conf.Configuration.set(Configuration.java:816)
+        at org.apache.hadoop.conf.Configuration.set(Configuration.java:798)
+        at org.apache.hadoop.yarn.conf.HAUtil.setConfValue(HAUtil.java:100)
+        at org.apache.hadoop.yarn.conf.HAUtil.setAllRpcAddresses(HAUtil.java:105)
+        at org.apache.hadoop.yarn.server.resourcemanager.RMHAProtocolService.serviceInit(RMHAProtocolService.java:60)
+        at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
+        at org.apache.hadoop.service.CompositeService.serviceInit(CompositeService.java:108)
+        at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceInit(ResourceManager.java:187)
+        at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
+        at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:940)
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1303">YARN-1303</a>.
+     Major improvement reported by Tassapol Athiapinya and fixed by Xuan Gong (applications/distributed-shell)<br>
+     <b>Allow multiple commands separating with ";" in distributed-shell</b><br>
+     <blockquote>In shell, we can do "ls; ls" to run 2 commands at once. 
+
+In distributed shell, this does not work. We should improve it to allow this. There are practical use cases that I know of for running multiple commands or setting environment variables before a command.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1300">YARN-1300</a>.
+     Major bug reported by Ted Yu and fixed by Ted Yu <br>
+     <b>SLS tests fail because conf puts yarn properties in fair-scheduler.xml</b><br>
+     <blockquote>I was looking at https://builds.apache.org/job/PreCommit-YARN-Build/2165//testReport/org.apache.hadoop.yarn.sls/TestSLSRunner/testSimulatorRunning/
+I am able to reproduce the failure locally.
+
+I found that FairSchedulerConfiguration.getAllocationFile() doesn't read the yarn.scheduler.fair.allocation.file config entry from fair-scheduler.xml
+
+This leads to the following:
+{code}
+Caused by: org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.AllocationConfigurationException: Bad fair scheduler config file: top-level element not &lt;allocations&gt;
+	at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.QueueManager.reloadAllocs(QueueManager.java:302)
+	at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.QueueManager.initialize(QueueManager.java:108)
+	at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.reinitialize(FairScheduler.java:1145)
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1295">YARN-1295</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza (nodemanager)<br>
+     <b>In UnixLocalWrapperScriptBuilder, using bash -c can cause "Text file busy" errors</b><br>
+     <blockquote>I missed this when working on YARN-1271.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1293">YARN-1293</a>.
+     Major bug reported by Tsuyoshi OZAWA and fixed by Tsuyoshi OZAWA <br>
+     <b>TestContainerLaunch.testInvalidEnvSyntaxDiagnostics fails on trunk</b><br>
+     <blockquote>{quote}
+-------------------------------------------------------------------------------
+Test set: org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.TestContainerLaunch
+-------------------------------------------------------------------------------
+Tests run: 8, Failures: 1, Errors: 0, Skipped: 0, Time elapsed: 12.655 sec &lt;&lt;&lt; FAILURE! - in org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.TestContainerLaunch
+testInvalidEnvSyntaxDiagnostics(org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.TestContainerLaunch)  Time elapsed: 0.114 sec  &lt;&lt;&lt; FAILURE!
+junit.framework.AssertionFailedError: null
+        at junit.framework.Assert.fail(Assert.java:48)
+        at junit.framework.Assert.assertTrue(Assert.java:20)
+        at junit.framework.Assert.assertTrue(Assert.java:27)
+        at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.TestContainerLaunch.testInvalidEnvSyntaxDiagnostics(TestContainerLaunch.java:273)
+{quote}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1290">YARN-1290</a>.
+     Major improvement reported by Wei Yan and fixed by Wei Yan <br>
+     <b>Let continuous scheduling achieve more balanced task assignment</b><br>
+     <blockquote>Currently, in continuous scheduling (YARN-1010), in each round the thread iterates over pre-ordered nodes and assigns tasks. This mechanism may overload the first several nodes, while later nodes get no tasks.
+
+We should sort all nodes according to available resource. In each round, always assign tasks to nodes with larger capacity, which can balance the load distribution among all nodes.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1288">YARN-1288</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>Make Fair Scheduler ACLs more user friendly</b><br>
+     <blockquote>The Fair Scheduler currently defaults the root queue's acl to empty and all other queues' acl to "*".  Now that YARN-1258 enables configuring the root queue, we should reverse this.  This will also bring the Fair Scheduler in line with the Capacity Scheduler.
+
+We should also not trim the acl strings, which makes it impossible to only specify groups in an acl.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1284">YARN-1284</a>.
+     Blocker bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (nodemanager)<br>
+     <b>LCE: Race condition leaves dangling cgroups entries for killed containers</b><br>
+     <blockquote>When LCE &amp; cgroups are enabled and a container is killed (in this case by its owning AM, an MRAM), there seems to be a race condition at the OS level between delivering the SIGTERM/SIGKILL and the OS completing all necessary cleanup.
+
+LCE code, after sending the SIGTERM/SIGKILL and getting the exitcode, immediately attempts to clean up the cgroups entry for the container. But this is failing with an error like:
+
+{code}
+2013-10-07 15:21:24,359 WARN org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor: Exit code from container container_1381179532433_0016_01_000011 is : 143
+2013-10-07 15:21:24,359 DEBUG org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container: Processing container_1381179532433_0016_01_000011 of type UPDATE_DIAGNOSTICS_MSG
+2013-10-07 15:21:24,359 DEBUG org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler: deleteCgroup: /run/cgroups/cpu/hadoop-yarn/container_1381179532433_0016_01_000011
+2013-10-07 15:21:24,359 WARN org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler: Unable to delete cgroup at: /run/cgroups/cpu/hadoop-yarn/container_1381179532433_0016_01_000011
+{code}
+
+
+CgroupsLCEResourcesHandler.clearLimits() has logic to wait for 500 ms for AM containers to avoid this problem. It seems this should be done for all containers.
+
+Still, waiting an extra 500 ms seems too expensive.
+
+We should look at a more time-efficient way of doing this, maybe spinning with a minimal sleep and a timeout until deleteCgroup() succeeds.
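+
+A minimal sketch of that spin-with-timeout idea (the helper name and timing values are illustrative, not the actual CgroupsLCEResourcesHandler code):
+{code}
+import java.io.File;
+
+public class CgroupDeleteRetrySketch {
+  // Retry removing the (empty) cgroup directory with a short sleep until it succeeds
+  // or the timeout expires, instead of always waiting a fixed 500 ms.
+  static boolean deleteCgroupWithTimeout(String cgroupPath, long timeoutMs, long sleepMs)
+      throws InterruptedException {
+    long deadline = System.currentTimeMillis() + timeoutMs;
+    File cgroup = new File(cgroupPath);
+    while (System.currentTimeMillis() &lt; deadline) {
+      if (!cgroup.exists() || cgroup.delete()) {
+        return true;  // gone, or rmdir succeeded once the kernel finished its cleanup
+      }
+      Thread.sleep(sleepMs);  // e.g. 20 ms between attempts
+    }
+    return false;  // caller logs the "Unable to delete cgroup" warning
+  }
+}
+{code}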
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1283">YARN-1283</a>.
+     Major sub-task reported by Yesha Vora and fixed by Omkar Vinit Joshi <br>
+     <b>Invalid 'url of job' mentioned in Job output with yarn.http.policy=HTTPS_ONLY</b><br>
+     <blockquote>After setting yarn.http.policy=HTTPS_ONLY, the job output shows an incorrect "The url to track the job" value.
+
+Currently, it prints http://RM:&lt;httpsport&gt;/proxy/application_1381162886563_0001/ instead of https://RM:&lt;httpsport&gt;/proxy/application_1381162886563_0001/
+
+http://hostname:8088/proxy/application_1381162886563_0001/ is invalid
+
+hadoop  jar hadoop-mapreduce-client-jobclient-tests.jar sleep -m 1 -r 1 
+13/10/07 18:39:39 INFO client.RMProxy: Connecting to ResourceManager at hostname/100.00.00.000:8032
+13/10/07 18:39:40 INFO mapreduce.JobSubmitter: number of splits:1
+13/10/07 18:39:40 INFO Configuration.deprecation: user.name is deprecated. Instead, use mapreduce.job.user.name
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.map.tasks.speculative.execution is deprecated. Instead, use mapreduce.map.speculative
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.reduce.tasks is deprecated. Instead, use mapreduce.job.reduces
+13/10/07 18:39:40 INFO Configuration.deprecation: mapreduce.partitioner.class is deprecated. Instead, use mapreduce.job.partitioner.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.reduce.tasks.speculative.execution is deprecated. Instead, use mapreduce.reduce.speculative
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.mapoutput.value.class is deprecated. Instead, use mapreduce.map.output.value.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapreduce.map.class is deprecated. Instead, use mapreduce.job.map.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.job.name is deprecated. Instead, use mapreduce.job.name
+13/10/07 18:39:40 INFO Configuration.deprecation: mapreduce.reduce.class is deprecated. Instead, use mapreduce.job.reduce.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapreduce.inputformat.class is deprecated. Instead, use mapreduce.job.inputformat.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.input.dir is deprecated. Instead, use mapreduce.input.fileinputformat.inputdir
+13/10/07 18:39:40 INFO Configuration.deprecation: mapreduce.outputformat.class is deprecated. Instead, use mapreduce.job.outputformat.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.mapoutput.key.class is deprecated. Instead, use mapreduce.map.output.key.class
+13/10/07 18:39:40 INFO Configuration.deprecation: mapred.working.dir is deprecated. Instead, use mapreduce.job.working.dir
+13/10/07 18:39:40 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1381162886563_0001
+13/10/07 18:39:40 INFO impl.YarnClientImpl: Submitted application application_1381162886563_0001 to ResourceManager at hostname/100.00.00.000:8032
+13/10/07 18:39:40 INFO mapreduce.Job: The url to track the job: http://hostname:8088/proxy/application_1381162886563_0001/
+13/10/07 18:39:40 INFO mapreduce.Job: Running job: job_1381162886563_0001
+13/10/07 18:39:46 INFO mapreduce.Job: Job job_1381162886563_0001 running in uber mode : false
+13/10/07 18:39:46 INFO mapreduce.Job:  map 0% reduce 0%
+13/10/07 18:39:53 INFO mapreduce.Job:  map 100% reduce 0%
+13/10/07 18:39:58 INFO mapreduce.Job:  map 100% reduce 100%
+13/10/07 18:39:58 INFO mapreduce.Job: Job job_1381162886563_0001 completed successfully
+13/10/07 18:39:58 INFO mapreduce.Job: Counters: 43
+	File System Counters
+		FILE: Number of bytes read=26
+		FILE: Number of bytes written=177279
+		FILE: Number of read operations=0
+		FILE: Number of large read operations=0
+		FILE: Number of write operations=0
+		HDFS: Number of bytes read=48
+		HDFS: Number of bytes written=0
+		HDFS: Number of read operations=1
+		HDFS: Number of large read operations=0
+		HDFS: Number of write operations=0
+	Job Counters 
+		Launched map tasks=1
+		Launched reduce tasks=1
+		Other local map tasks=1
+		Total time spent by all maps in occupied slots (ms)=7136
+		Total time spent by all reduces in occupied slots (ms)=6062
+	Map-Reduce Framework
+		Map input records=1
+		Map output records=1
+		Map output bytes=4
+		Map output materialized bytes=22
+		Input split bytes=48
+		Combine input records=0
+		Combine output records=0
+		Reduce input groups=1
+		Reduce shuffle bytes=22
+		Reduce input records=1
+		Reduce output records=0
+		Spilled Records=2
+		Shuffled Maps =1
+		Failed Shuffles=0
+		Merged Map outputs=1
+		GC time elapsed (ms)=60
+		CPU time spent (ms)=1700
+		Physical memory (bytes) snapshot=567582720
+		Virtual memory (bytes) snapshot=4292997120
+		Total committed heap usage (bytes)=846594048
+	Shuffle Errors
+		BAD_ID=0
+		CONNECTION=0
+		IO_ERROR=0
+		WRONG_LENGTH=0
+		WRONG_MAP=0
+		WRONG_REDUCE=0
+	File Input Format Counters 
+		Bytes Read=0
+	File Output Format Counters 
+		Bytes Written=0
+
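+The underlying issue is just the URL scheme; a minimal sketch of deriving it from the policy setting (the helper below is illustrative, not the actual MR/RM proxy code):
+{code}
+import org.apache.hadoop.conf.Configuration;
+
+public class TrackingUrlSchemeSketch {
+  // Pick the scheme for the proxy/tracking URL from the configured HTTP policy,
+  // so a HTTPS_ONLY cluster prints an https:// link instead of http://.
+  static String trackingUrl(Configuration conf, String proxyHostPort, String appId) {
+    String policy = conf.get("yarn.http.policy", "HTTP_ONLY");
+    String scheme = "HTTPS_ONLY".equals(policy) ? "https://" : "http://";
+    return scheme + proxyHostPort + "/proxy/" + appId + "/";
+  }
+}
+{code}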
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1268">YARN-1268</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>TestFairScheduler.testContinuousScheduling is flaky</b><br>
+     <blockquote>It looks like there's a timeout in it that's causing it to be flaky.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1265">YARN-1265</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza (resourcemanager , scheduler)<br>
+     <b>Fair Scheduler chokes on unhealthy node reconnect</b><br>
+     <blockquote>Only nodes in the RUNNING state are tracked by schedulers.  When a node reconnects, RMNodeImpl.ReconnectNodeTransition tries to remove it, even if it's in the RUNNING state.  The FairScheduler doesn't guard against this.
+
+I think the best way to fix this is to check to see whether a node is RUNNING before telling the scheduler to remove it.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1259">YARN-1259</a>.
+     Trivial bug reported by Sandy Ryza and fixed by Robert Kanter (scheduler)<br>
+     <b>In Fair Scheduler web UI, queue num pending and num active apps switched</b><br>
+     <blockquote>The values returned in FairSchedulerLeafQueueInfo by numPendingApplications and numActiveApplications should be switched.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1258">YARN-1258</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza (scheduler)<br>
+     <b>Allow configuring the Fair Scheduler root queue</b><br>
+     <blockquote>This would be useful for acls, maxRunningApps, scheduling modes, etc.
+
+The allocation file should be able to accept both:
+* An implicit root queue
+* A root queue at the top of the hierarchy with all queues under/inside of it</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1253">YARN-1253</a>.
+     Blocker new feature reported by Alejandro Abdelnur and fixed by Roman Shaposhnik (nodemanager)<br>
+     <b>Changes to LinuxContainerExecutor to run containers as a single dedicated user in non-secure mode</b><br>
+     <blockquote>When using cgroups we require LCE to be configured in the cluster to start containers. 
+
+LCE starts containers as the user that submitted the job. While this works correctly in a secure setup, in a non-secure setup this presents a couple of issues:
+
+* LCE requires all Hadoop users submitting jobs to be Unix users in all nodes
+* Because users can impersonate other users, any user would have access to any local file of other users
+
+Particularly, the second issue is not desirable as a user could get access to ssh keys of other users in the nodes or if there are NFS mounts, get to other users data outside of the cluster.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1241">YARN-1241</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>In Fair Scheduler, maxRunningApps does not work for non-leaf queues</b><br>
+     <blockquote>Setting the maxRunningApps property on a parent queue should ensure that the total number of running apps across all of its subqueues can't exceed it.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1239">YARN-1239</a>.
+     Major sub-task reported by Bikas Saha and fixed by Jian He (resourcemanager)<br>
+     <b>Save version information in the state store</b><br>
+     <blockquote>When creating the root dir for the first time we should write version 1. If the root dir exists then we should check that the version in the state store matches the version from the config.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1232">YARN-1232</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Configuration to support multiple RMs</b><br>
+     <blockquote>We should augment the configuration to allow users specify two RMs and the individual RPC addresses for them.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1222">YARN-1222</a>.
+     Major sub-task reported by Bikas Saha and fixed by Karthik Kambatla <br>
+     <b>Make improvements in ZKRMStateStore for fencing</b><br>
+     <blockquote>Use multi-operations for every ZK interaction.
+In every operation, automatically create/delete a lock znode that is a child of the root znode. This achieves fencing by modifying the create/delete permissions on the root znode.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1210">YARN-1210</a>.
+     Major sub-task reported by Vinod Kumar Vavilapalli and fixed by Omkar Vinit Joshi <br>
+     <b>During RM restart, RM should start a new attempt only when previous attempt exits for real</b><br>
+     <blockquote>When the RM recovers, it can wait for existing AMs to contact it and then kill them forcefully before even starting a new AM. Worst case, the RM will start a new AppAttempt after waiting for 10 minutes (the expiry interval). This way we'll minimize multiple AMs racing with each other. This can help with issues in downstream components like Pig, Hive and Oozie during RM restart.
+
+In the meanwhile, new apps will proceed as usual while existing apps wait for recovery.
+
+This can continue to be useful after work-preserving restart, so that AMs which can properly sync back up with RM can continue to run and those that don't are guaranteed to be killed before starting a new attempt.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1199">YARN-1199</a>.
+     Major improvement reported by Mit Desai and fixed by Mit Desai <br>
+     <b>Make NM/RM Versions Available</b><br>
+     <blockquote>Now that we have the NM and RM versions available, we can display the YARN version of nodes running in the cluster.
+
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1188">YARN-1188</a>.
+     Trivial bug reported by Akira AJISAKA and fixed by Tsuyoshi OZAWA <br>
+     <b>The context of QueueMetrics becomes 'default' when using FairScheduler</b><br>
+     <blockquote>I found the context of QueueMetrics changed to 'default' from 'yarn' when I was using FairScheduler.
+The context should always be 'yarn'; this can be fixed by adding an annotation to FSQueueMetrics like below:
+
+{code}
++ @Metrics(context="yarn")
+public class FSQueueMetrics extends QueueMetrics {
+{code}</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1185">YARN-1185</a>.
+     Major sub-task reported by Jason Lowe and fixed by Omkar Vinit Joshi (resourcemanager)<br>
+     <b>FileSystemRMStateStore can leave partial files that prevent subsequent recovery</b><br>
+     <blockquote>FileSystemRMStateStore writes directly to the destination file when storing state. However if the RM were to crash in the middle of the write, the recovery method could encounter a partially-written file and either outright crash during recovery or silently load incomplete state.
+
+To avoid this, the data should be written to a temporary file and renamed to the destination file afterwards.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1183">YARN-1183</a>.
+     Major bug reported by Andrey Klochkov and fixed by Andrey Klochkov <br>
+     <b>MiniYARNCluster shutdown takes several minutes intermittently</b><br>
+     <blockquote>As described in MAPREDUCE-5501, sometimes M/R tests leave MRAppMaster java processes living for several minutes after successful completion of the corresponding test. There is a concurrency issue in the MiniYARNCluster shutdown logic which leads to this. Sometimes the RM stops before an app master sends its last report, and then the app master keeps retrying for &gt;6 minutes. In some cases this leads to failures in subsequent tests, and it affects the performance of tests as app masters eat resources.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1182">YARN-1182</a>.
+     Major bug reported by Karthik Kambatla and fixed by Karthik Kambatla <br>
+     <b>MiniYARNCluster creates and inits the RM/NM only on start()</b><br>
+     <blockquote>MiniYARNCluster creates and inits the RM/NM only on start(). It should create and init() during init() itself.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1181">YARN-1181</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla <br>
+     <b>Augment MiniYARNCluster to support HA mode</b><br>
+     <blockquote>MiniYARNHACluster, along the lines of MiniYARNCluster, is needed for end-to-end HA tests.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1180">YARN-1180</a>.
+     Trivial bug reported by Thomas Graves and fixed by Chen He (capacityscheduler)<br>
+     <b>Update capacity scheduler docs to include types on the configs</b><br>
+     <blockquote>The capacity scheduler docs (http://hadoop.apache.org/docs/r2.1.0-beta/hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html) don't include types for all the configs. For instance, minimum-user-limit-percent doesn't say it's an Int.  It is also the only setting among the Resource Allocation configs that is an Int rather than a float.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1176">YARN-1176</a>.
+     Critical bug reported by Thomas Graves and fixed by Jonathan Eagles (resourcemanager)<br>
+     <b>RM web services ClusterMetricsInfo total nodes doesn't include unhealthy nodes</b><br>
+     <blockquote>In the web services api for the cluster/metrics, the totalNodes reported doesn't include the unhealthy nodes.
+
+this.totalNodes = activeNodes + lostNodes + decommissionedNodes
+	        + rebootedNodes;</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1172">YARN-1172</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Tsuyoshi OZAWA (resourcemanager)<br>
+     <b>Convert *SecretManagers in the RM to services</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1145">YARN-1145</a>.
+     Major bug reported by Rohith and fixed by Rohith <br>
+     <b>Potential file handle leak in aggregated logs web ui</b><br>
+     <blockquote>If there is any problem getting aggregated logs for rendering on the web UI, the LogReader is not closed.
+
+Because the reader is not closed, many connections are left in CLOSE_WAIT state.
+
+hadoopuser@hadoopuser:&gt; jps
+*27909* JobHistoryServer
+
+The DataNode port is 50010. When grepping for the DataNode port, many connections from the JHS are in CLOSE_WAIT.
+hadoopuser@hadoopuser:&gt; netstat -tanlp |grep 50010
+tcp        0      0 10.18.40.48:50010       0.0.0.0:*               LISTEN      21453/java          
+tcp        1      0 10.18.40.48:20596       10.18.40.48:50010       CLOSE_WAIT  *27909*/java          
+tcp        1      0 10.18.40.48:19667       10.18.40.152:50010      CLOSE_WAIT  *27909*/java          
+tcp        1      0 10.18.40.48:20593       10.18.40.48:50010       CLOSE_WAIT  *27909*/java          
+tcp        1      0 10.18.40.48:12290       10.18.40.48:50010       CLOSE_WAIT  *27909*/java          
+tcp        1      0 10.18.40.48:19662       10.18.40.152:50010      CLOSE_WAIT  *27909*/java          </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1138">YARN-1138</a>.
+     Major bug reported by Yingda Chen and fixed by Chuan Liu (api)<br>
+     <b>yarn.application.classpath is set to point to $HADOOP_CONF_DIR etc., which does not work on Windows</b><br>
+     <blockquote>yarn-default.xml has the "yarn.application.classpath" entry set to $HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/share/hadoop/common/*,$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,$HADOOP_YARN_HOME/share/hadoop/yarn/*,$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*. This does not work on Windows and needs to be fixed.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1121">YARN-1121</a>.
+     Major sub-task reported by Bikas Saha and fixed by Jian He (resourcemanager)<br>
+     <b>RMStateStore should flush all pending store events before closing</b><br>
+     <blockquote>On serviceStop it should wait for all internal pending events to drain before stopping.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1119">YARN-1119</a>.
+     Major test reported by Robert Parker and fixed by Mit Desai (resourcemanager)<br>
+     <b>Add ClusterMetrics checks to the TestRMNodeTransitions tests</b><br>
+     <blockquote>YARN-1101 identified an issue where UNHEALTHY nodes could double-decrement the active node count. We should add checks for RUNNING node transitions.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1109">YARN-1109</a>.
+     Major improvement reported by Sandy Ryza and fixed by haosdent (nodemanager)<br>
+     <b>Demote NodeManager "Sending out status for container" logs to debug</b><br>
+     <blockquote>Diagnosing NodeManager and container launch problems is made more difficult by the enormous number of logs like
+{code}
+Sending out status for container: container_id {, app_attempt_id {, application_id {, id: 18, cluster_timestamp: 1377559361179, }, attemptId: 1, }, id: 1337, }, state: C_RUNNING, diagnostics: "Container killed by the ApplicationMaster.\n", exit_status: -1000
+{code}
+
+On an NM with a few containers I am seeing tens of these per second.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1101">YARN-1101</a>.
+     Major bug reported by Robert Parker and fixed by Robert Parker (resourcemanager)<br>
+     <b>Active nodes can be decremented below 0</b><br>
+     <blockquote>The issue is in RMNodeImpl, where both the RUNNING and UNHEALTHY states transition to a deactivated state (LOST, DECOMMISSIONED, REBOOTED) using the same DeactivateNodeTransition class.  The DeactivateNodeTransition class naturally decrements the active node count; however, in cases where the node has transitioned to UNHEALTHY, the active count has already been decremented.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1098">YARN-1098</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Separate out RM services into "Always On" and "Active"</b><br>
+     <blockquote>From the discussion on YARN-1027, it makes sense to separate out services that are stateful and stateless. The stateless services can run perennially irrespective of whether the RM is in the Active or Standby state, while the stateful services need to be started on transitionToActive() and completely shut down on transitionToStandby().
+
+The external-facing stateless services should respond to the client/AM/NM requests depending on whether the RM is Active/Standby.
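+
+An illustrative sketch of the split using Hadoop's CompositeService (class names here are invented for illustration; the actual RM wiring differs):
+{code}
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.CompositeService;
+import org.apache.hadoop.service.Service;
+
+// "Always On" services live for the whole RM lifetime; the stateful "Active" services
+// are grouped in their own composite so they can be started on transitionToActive()
+// and fully stopped on transitionToStandby().
+public class HaServiceSplitSketch extends CompositeService {
+  private CompositeService activeServices;
+
+  public HaServiceSplitSketch() {
+    super("HaServiceSplitSketch");
+  }
+
+  void addAlwaysOnService(Service s) {
+    addService(s);  // initialized once and kept running regardless of HA state
+  }
+
+  synchronized void transitionToActive(Configuration conf) {
+    // group the stateful services (schedulers, state store, ...) inside activeServices
+    activeServices = new CompositeService("ActiveServices");
+    activeServices.init(conf);
+    activeServices.start();
+  }
+
+  synchronized void transitionToStandby() {
+    if (activeServices != null) {
+      activeServices.stop();
+      activeServices = null;
+    }
+  }
+}
+{code}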
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1068">YARN-1068</a>.
+     Major sub-task reported by Karthik Kambatla and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Add admin support for HA operations</b><br>
+     <blockquote>Support HA admin operations to facilitate transitioning the RM to Active and Standby states.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1060">YARN-1060</a>.
+     Major bug reported by Sandy Ryza and fixed by Niranjan Singh (scheduler)<br>
+     <b>Two tests in TestFairScheduler are missing @Test annotation</b><br>
+     <blockquote>Amazingly, these tests appear to pass with the annotations added.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1053">YARN-1053</a>.
+     Blocker bug reported by Omkar Vinit Joshi and fixed by Omkar Vinit Joshi <br>
+     <b>Diagnostic message from ContainerExitEvent is ignored in ContainerImpl</b><br>
+     <blockquote>If the container launch fails then we send a ContainerExitEvent. This event contains the exit code and a diagnostic message. Today we ignore the diagnostic message while handling this event inside ContainerImpl. Fixing this, as the message is useful in diagnosing the failure.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1044">YARN-1044</a>.
+     Critical bug reported by Sangjin Lee and fixed by Sangjin Lee (resourcemanager , scheduler)<br>
+     <b>used/min/max resources do not display info in the scheduler page</b><br>
+     <blockquote>Go to the scheduler page in the RM, and click any queue to display the detailed info. You'll find that none of the resource entries (used, min, or max) display values.
+
+This is because the values contain brackets ("&lt;" and "&gt;") and are not properly HTML-escaped.
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1033">YARN-1033</a>.
+     Major sub-task reported by Nemon Lou and fixed by Karthik Kambatla <br>
+     <b>Expose RM active/standby state to Web UI and REST API</b><br>
+     <blockquote>Both the active and standby RMs shall expose their web servers and show their current state (active or standby) on the web page. Users should be able to access this information through the REST API as well.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1029">YARN-1029</a>.
+     Major sub-task reported by Bikas Saha and fixed by Karthik Kambatla <br>
+     <b>Allow embedding leader election into the RM</b><br>
+     <blockquote>It should be possible to embed common ActiveStandyElector into the RM such that ZooKeeper based leader election and notification is in-built. In conjunction with a ZK state store, this configuration will be a simple deployment option.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1028">YARN-1028</a>.
+     Major sub-task reported by Bikas Saha and fixed by Karthik Kambatla <br>
+     <b>Add FailoverProxyProvider like capability to RMProxy</b><br>
+     <blockquote>RMProxy layer currently abstracts RM discovery and implements it by looking up service information from configuration. Motivated by HDFS and using existing classes from Common, we can add failover proxy providers that may provide RM discovery in extensible ways.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1027">YARN-1027</a>.
+     Major sub-task reported by Bikas Saha and fixed by Karthik Kambatla <br>
+     <b>Implement RMHAProtocolService</b><br>
+     <blockquote>Implement existing HAServiceProtocol from Hadoop common. This protocol is the single point of interaction between the RM and HA clients/services.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1022">YARN-1022</a>.
+     Trivial bug reported by Bikas Saha and fixed by haosdent <br>
+     <b>Unnecessary INFO logs in AMRMClientAsync</b><br>
+     <blockquote>Logs like the following should be at debug level, or else every legitimate stop causes unnecessary exception traces in the logs.
+
+464 2013-08-03 20:01:34,459 INFO [AMRM Heartbeater thread] org.apache.hadoop.yarn.client.api.async.impl.AMRMClientAsyncImpl:            Heartbeater interrupted
+465 java.lang.InterruptedException: sleep interrupted
+466   at java.lang.Thread.sleep(Native Method)
+467   at org.apache.hadoop.yarn.client.api.async.impl.AMRMClientAsyncImpl$HeartbeatThread.run(AMRMClientAsyncImpl.java:249)
+468 2013-08-03 20:01:34,460 INFO [AMRM Callback Handler Thread] org.apache.hadoop.yarn.client.api.async.impl.AMRMClientAsyncImpl:       Interrupted while waiting for queue
+469 java.lang.InterruptedException
+470   at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.reportInterruptAfterWait(AbstractQueuedSynchronizer.     java:1961)
+471   at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1996)
+472   at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:399)
+473   at org.apache.hadoop.yarn.client.api.async.impl.AMRMClientAsyncImpl$CallbackHandlerThread.run(AMRMClientAsyncImpl.java:275)</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1021">YARN-1021</a>.
+     Major new feature reported by Wei Yan and fixed by Wei Yan (scheduler)<br>
+     <b>Yarn Scheduler Load Simulator</b><br>
+     <blockquote>The Yarn Scheduler is a fertile area of interest with different implementations, e.g., Fifo, Capacity and Fair schedulers. Meanwhile, several optimizations are also made to improve scheduler performance for different scenarios and workloads. Each scheduler algorithm has its own set of features, and drives scheduling decisions by many factors, such as fairness, capacity guarantee, resource availability, etc. It is very important to evaluate a scheduler algorithm very well before we deploy it in a production cluster. Unfortunately, currently it is non-trivial to evaluate a scheduling algorithm. Evaluating in a real cluster is always time- and cost-consuming, and it is also very hard to find a large-enough cluster. Hence, a simulator which can predict how well a scheduler algorithm works for some specific workload would be quite useful.
+
+We want to build a Scheduler Load Simulator to simulate large-scale Yarn clusters and application loads in a single machine. This would be invaluable in furthering Yarn by providing a tool for researchers and developers to prototype new scheduler features and predict their behavior and performance with a reasonable amount of confidence, thereby aiding rapid innovation.
+
+The simulator will exercise the real Yarn ResourceManager, removing the network factor by simulating NodeManagers and ApplicationMasters and by handling and dispatching NM/AM heartbeat events from within the same JVM.
+
+To keep track of scheduler behavior and performance, a scheduler wrapper will wrap the real scheduler.
+
+The simulator will produce real time metrics while executing, including:
+
+* Resource usage for the whole cluster and each queue, which can be used to configure cluster and queue capacity.
+* The detailed application execution trace (recorded in relation to simulated time), which can be analyzed to understand/validate the scheduler behavior (individual jobs' turnaround time, throughput, fairness, capacity guarantee, etc).
+* Several key metrics of scheduler algorithm, such as time cost of each scheduler operation (allocate, handle, etc), which can be utilized by Hadoop developers to find the code spots and scalability limits.
+
+The simulator will provide real time charts showing the behavior of the scheduler and its performance.
+
+A short demo is available http://www.youtube.com/watch?v=6thLi8q0qLE, showing how to use simulator to simulate Fair Scheduler and Capacity Scheduler.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-1010">YARN-1010</a>.
+     Critical improvement reported by Alejandro Abdelnur and fixed by Wei Yan (scheduler)<br>
+     <b>FairScheduler: decouple container scheduling from nodemanager heartbeats</b><br>
+     <blockquote>Currently scheduling for a node is done when a node heartbeats.
+
+For large clusters where the heartbeat interval is set to several seconds, this delays scheduling of incoming allocations significantly.
+
+We could have a continuous loop scanning all nodes and doing scheduling. If there is availability, AMs will get the allocation in the next heartbeat after the one that placed the request.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-985">YARN-985</a>.
+     Major improvement reported by Ravi Prakash and fixed by Ravi Prakash (nodemanager)<br>
+     <b>Nodemanager should log where a resource was localized</b><br>
+     <blockquote>When a resource is localized, we should log WHERE on the local disk it was localized. This helps in debugging afterwards (e.g. if the disk was to go bad).</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-976">YARN-976</a>.
+     Major sub-task reported by Sandy Ryza and fixed by Sandy Ryza (documentation)<br>
+     <b>Document the meaning of a virtual core</b><br>
+     <blockquote>As virtual cores are a somewhat novel concept, it would be helpful to have thorough documentation that clarifies their meaning.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-895">YARN-895</a>.
+     Major sub-task reported by Jian He and fixed by Jian He (resourcemanager)<br>
+     <b>RM crashes if it restarts while the state-store is down</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-891">YARN-891</a>.
+     Major sub-task reported by Bikas Saha and fixed by Jian He (resourcemanager)<br>
+     <b>Store completed application information in RM state store</b><br>
+     <blockquote>Store completed application/attempt info in the RMStateStore when the application/attempt completes. This solves problems like finished applications getting lost after RM restart, and some other races like YARN-1195.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-888">YARN-888</a>.
+     Major bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur <br>
+     <b>clean up POM dependencies</b><br>
+     <blockquote>Intermediate 'pom' modules define dependencies inherited by leaf modules.
+
+This is causing issues in the IntelliJ IDE.
+
+We should normalize the leaf modules as in common, hdfs and tools, where all dependencies are defined in each leaf module and the intermediate 'pom' modules do not define any dependencies.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-879">YARN-879</a>.
+     Major bug reported by Junping Du and fixed by Junping Du <br>
+     <b>Fix tests w.r.t o.a.h.y.server.resourcemanager.Application</b><br>
+     <blockquote>getResources() should return a list of containers allocated by the RM. However, it now returns null directly. The worse thing is: if LOG.debug is enabled, it will definitely cause an NPE.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-819">YARN-819</a>.
+     Major sub-task reported by Robert Parker and fixed by Robert Parker (nodemanager , resourcemanager)<br>
+     <b>ResourceManager and NodeManager should check for a minimum allowed version</b><br>
+     <blockquote>Our use case is that during an upgrade on a large cluster, several NodeManagers may not restart with the new version.  Once the RM comes back up, the NodeManagers will re-register with the RM without issue.
+
+The NM should report its version to the RM.  The RM should have a configuration to disable the check (default), require a version equal to the RM (to prevent a config change for each release), equal to or greater than the RM (to allow NM upgrades), and finally an explicit version or version range.
+
+The RM should also have a configuration on how to treat a mismatch: REJECT, or REBOOT the NM.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-807">YARN-807</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>When querying apps by queue, iterating over all apps is inefficient and limiting </b><br>
+     <blockquote>The question "which apps are in queue x" can be asked via the RM REST APIs, through the ClientRMService, and through the command line.  In all these cases, the question is answered by scanning through every RMApp and filtering by the app's queue name.
+
+All schedulers maintain a mapping of queues to applications.  I think it would make more sense to ask the schedulers which applications are in a given queue. This is what was done in MR1. This would also have the advantage of allowing a parent queue to return all the applications on leaf queues under it, and allow queue name aliases, as in the way that "root.default" and "default" refer to the same queue in the fair scheduler.
+
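+A minimal sketch of the per-queue index a scheduler could expose instead of the full scan (names are illustrative, not the actual scheduler API):
+{code}
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class QueueAppIndexSketch {
+  // Schedulers already know which applications sit in which queue; exposing that
+  // index turns "which apps are in queue x" into a map lookup instead of a scan
+  // over every RMApp.
+  private final Map&lt;String, List&lt;String&gt;&gt; appsByQueue = new HashMap&lt;String, List&lt;String&gt;&gt;();
+
+  void addApp(String queue, String appId) {
+    List&lt;String&gt; apps = appsByQueue.get(queue);
+    if (apps == null) {
+      apps = new ArrayList&lt;String&gt;();
+      appsByQueue.put(queue, apps);
+    }
+    apps.add(appId);
+  }
+
+  Collection&lt;String&gt; getAppsInQueue(String queue) {
+    List&lt;String&gt; apps = appsByQueue.get(queue);
+    return apps == null ? new ArrayList&lt;String&gt;() : apps;
+  }
+}
+{code}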
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-786">YARN-786</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>Expose application resource usage in RM REST API</b><br>
+     <blockquote>It might be good to require users to explicitly ask for this information, as it's a little more expensive to collect than the other fields in AppInfo.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-764">YARN-764</a>.
+     Major bug reported by Nemon Lou and fixed by Nemon Lou (resourcemanager)<br>
+     <b>blank Used Resources on Capacity Scheduler page </b><br>
+     <blockquote>Even when there are jobs running, used resources is empty on the Capacity Scheduler page for a leaf queue. (I use Google Chrome on Windows 7.)
+After changing Resource.java's toString method by replacing "&lt;&gt;" with "{}", this bug gets fixed.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-709">YARN-709</a>.
+     Major sub-task reported by Jian He and fixed by Jian He (resourcemanager)<br>
+     <b>verify that new jobs submitted with old RM delegation tokens after RM restart are accepted</b><br>
+     <blockquote>More elaborate test for restoring RM delegation tokens on RM restart.
+New jobs with old RM delegation tokens should be accepted by new RM as long as the token is still valid</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-674">YARN-674</a>.
+     Major sub-task reported by Vinod Kumar Vavilapalli and fixed by Omkar Vinit Joshi (resourcemanager)<br>
+     <b>Slow or failing DelegationToken renewals on submission itself make RM unavailable</b><br>
+     <blockquote>This was caused by YARN-280. A slow or down NameNode will make it look like the RM is unavailable, as it may run out of RPC handlers due to blocked client submissions.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-649">YARN-649</a>.
+     Major sub-task reported by Sandy Ryza and fixed by Sandy Ryza (nodemanager)<br>
+     <b>Make container logs available over HTTP in plain text</b><br>
+     <blockquote>It would be good to make container logs available over the REST API for MAPREDUCE-4362, and so that they can be accessed programmatically in general.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-584">YARN-584</a>.
+     Major bug reported by Sandy Ryza and fixed by Harshit Daga (scheduler)<br>
+     <b>In scheduler web UIs, queues unexpand on refresh</b><br>
+     <blockquote>In the fair scheduler web UI, you can expand queue information.  Refreshing the page causes the expansions to go away, which is annoying for someone who wants to monitor the scheduler page and needs to reopen all the queues they care about each time.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-546">YARN-546</a>.
+     Major bug reported by Lohit Vijayarenu and fixed by Sandy Ryza (scheduler)<br>
+     <b>Allow disabling the Fair Scheduler event log</b><br>
+     <blockquote>Hadoop 1.0 supported an option to turn on/off FairScheduler event logging using mapred.fairscheduler.eventlog.enabled. In Hadoop 2.0, it looks like this option has been removed (or not ported?) which causes event logging to be enabled by default and there is no way to turn it off.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-478">YARN-478</a>.
+     Major sub-task reported by Aleksey Gorshkov and fixed by Aleksey Gorshkov <br>
+     <b>fix coverage org.apache.hadoop.yarn.webapp.log</b><br>
+     <blockquote>fix coverage org.apache.hadoop.yarn.webapp.log
+one patch for trunk, branch-2, branch-0.23</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-465">YARN-465</a>.
+     Major sub-task reported by Aleksey Gorshkov and fixed by Andrey Klochkov <br>
+     <b>fix coverage  org.apache.hadoop.yarn.server.webproxy</b><br>
+     <blockquote>fix coverage  org.apache.hadoop.yarn.server.webproxy
+patch YARN-465-trunk.patch for trunk
+patch YARN-465-branch-2.patch for branch-2
+patch YARN-465-branch-0.23.patch for branch-0.23
+
+There is an issue in branch-0.23: the patch does not create the .keep file.
+To fix it, the following commands need to be run:
+
+mkdir yhadoop-common/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/proxy
+touch yhadoop-common/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/proxy/.keep 
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-461">YARN-461</a>.
+     Major bug reported by Sandy Ryza and fixed by Wei Yan (resourcemanager)<br>
+     <b>Fair scheduler should not accept apps with empty string queue name</b><br>
+     <blockquote>When an app is submitted with "" for the queue, the RMAppManager passes it on like it does with any other string.
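+
+A minimal sketch of the submission-time guard this implies (method name and message are illustrative):
+{code}
+public class QueueNameCheckSketch {
+  // Reject the submission up front instead of letting an empty queue name
+  // flow through RMAppManager into the scheduler.
+  static void validateQueueName(String queueName) {
+    if (queueName == null || queueName.trim().isEmpty()) {
+      throw new IllegalArgumentException("Application submitted without a valid queue name");
+    }
+  }
+}
+{code}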
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-427">YARN-427</a>.
+     Major sub-task reported by Aleksey Gorshkov and fixed by Aleksey Gorshkov <br>
+     <b>Coverage fix for org.apache.hadoop.yarn.server.api.*</b><br>
+     <blockquote>Coverage fix for org.apache.hadoop.yarn.server.api.*
+
+patch YARN-427-trunk.patch for trunk
+patch YARN-427-branch-2.patch for branch-2 and branch-0.23</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-425">YARN-425</a>.
+     Major sub-task reported by Aleksey Gorshkov and fixed by Aleksey Gorshkov <br>
+     <b>coverage fix for yarn api</b><br>
+     <blockquote>coverage fix for yarn api
+patch YARN-425-trunk-a.patch for trunk
+patch YARN-425-branch-2.patch for branch-2
+patch YARN-425-branch-0.23.patch for branch-0.23</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-408">YARN-408</a>.
+     Minor bug reported by Mayank Bansal and fixed by Mayank Bansal (scheduler)<br>
+     <b>Capacity Scheduler delay scheduling should not be disabled by default</b><br>
+     <blockquote>Capacity Scheduler delay scheduling should not be disabled by default.
+Enable it with a value equal to the number of nodes in one rack.
+
+Thanks,
+Mayank</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-353">YARN-353</a>.
+     Major sub-task reported by Hitesh Shah and fixed by Karthik Kambatla (resourcemanager)<br>
+     <b>Add Zookeeper-based store implementation for RMStateStore</b><br>
+     <blockquote>Add a store that writes RM state data to ZK.
+</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-312">YARN-312</a>.
+     Major sub-task reported by Junping Du and fixed by Junping Du (api)<br>
+     <b>Add updateNodeResource in ResourceManagerAdministrationProtocol</b><br>
+     <blockquote>Add the fundamental RPC (ResourceManagerAdministrationProtocol) to support changing a node's resources. For design details, please refer to the parent JIRA: YARN-291.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-311">YARN-311</a>.
+     Major sub-task reported by Junping Du and fixed by Junping Du (resourcemanager , scheduler)<br>
+     <b>Dynamic node resource configuration: core scheduler changes</b><br>
+     <blockquote>As a first step, we handle resource changes on the RM side and will expose the admin APIs (admin protocol, CLI, REST and JMX) later. This JIRA only contains the scheduler changes.
+The flow for updating a node's resources and making the scheduler aware of the change is:
+1. The resource update goes through the admin API to the RM and takes effect on RMNodeImpl.
+2. When the next NM heartbeat reporting status arrives, the RMNode's resource change is detected and the delta is added to the SchedulerNode's availableResource before actual scheduling happens.
+3. The scheduler allocates resources according to the new availableResource in SchedulerNode.
+For more design details, please refer to the proposal and discussions in the parent JIRA: YARN-291.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-305">YARN-305</a>.
+     Critical bug reported by Lohit Vijayarenu and fixed by Lohit Vijayarenu (resourcemanager)<br>
+     <b>Fair scheduler logs too many "Node offered to app:..." messages</b><br>
+     <blockquote>Running YARN with the fair scheduler shows that the RM logs many messages like the one below.
+{noformat}
+INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.AppSchedulable: Node offered to app: application_1357147147433_0002 reserved: false
+{noformat}
+
+They don't convey much, and the same line is dumped many times in the RM log. It would be good to improve the message with node information or move it to another logging level with enough debug information.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/YARN-7">YARN-7</a>.
+     Major sub-task reported by Arun C Murthy and fixed by Junping Du <br>
+     <b>Add support for DistributedShell to ask for CPUs along with memory</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5744">MAPREDUCE-5744</a>.
+     Blocker bug reported by Sangjin Lee and fixed by Gera Shegalov <br>
+     <b>Job hangs because RMContainerAllocator$AssignedRequests.preemptReduce() violates the comparator contract</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5743">MAPREDUCE-5743</a>.
+     Major bug reported by Ted Yu and fixed by Ted Yu <br>
+     <b>TestRMContainerAllocator is failing</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5729">MAPREDUCE-5729</a>.
+     Critical bug reported by Karthik Kambatla and fixed by Karthik Kambatla (mrv2)<br>
+     <b>mapred job -list throws NPE</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5725">MAPREDUCE-5725</a>.
+     Major bug reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>TestNetworkedJob relies on the Capacity Scheduler</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5724">MAPREDUCE-5724</a>.
+     Critical bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (jobhistoryserver)<br>
+     <b>JobHistoryServer does not start if HDFS is not running</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5723">MAPREDUCE-5723</a>.
+     Blocker bug reported by Mohammad Kamrul Islam and fixed by Mohammad Kamrul Islam (applicationmaster)<br>
+     <b>MR AM container log can be truncated or empty</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5694">MAPREDUCE-5694</a>.
+     Major bug reported by Mohammad Kamrul Islam and fixed by Mohammad Kamrul Islam <br>
+     <b>MR AM container syslog is empty  </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5693">MAPREDUCE-5693</a>.
+     Major bug reported by Gera Shegalov and fixed by Gera Shegalov (mrv2)<br>
+     <b>Restore MRv1 behavior for log flush</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5692">MAPREDUCE-5692</a>.
+     Major improvement reported by Gera Shegalov and fixed by Gera Shegalov (mrv2)<br>
+     <b>Add explicit diagnostics when a task attempt is killed due to speculative execution</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5689">MAPREDUCE-5689</a>.
+     Critical bug reported by Lohit Vijayarenu and fixed by Lohit Vijayarenu <br>
+     <b>MRAppMaster does not preempt reducers when scheduled maps cannot be fulfilled</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5687">MAPREDUCE-5687</a>.
+     Major test reported by Ted Yu and fixed by Jian He <br>
+     <b>TestYARNRunner#testResourceMgrDelegate fails with NPE after YARN-1446</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5685">MAPREDUCE-5685</a>.
+     Blocker bug reported by Yi Song and fixed by Yi Song (client)<br>
+     <b>getCacheFiles()  api doesn't work in WrappedReducer.java due to typo</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5679">MAPREDUCE-5679</a>.
+     Major bug reported by Liyin Liang and fixed by Liyin Liang <br>
+     <b>TestJobHistoryParsing has race condition</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5674">MAPREDUCE-5674</a>.
+     Major bug reported by Chuan Liu and fixed by Chuan Liu (client)<br>
+     <b>Missing start and finish time in mapred.JobStatus</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5672">MAPREDUCE-5672</a>.
+     Major improvement reported by Gera Shegalov and fixed by Gera Shegalov (mr-am , mrv2)<br>
+     <b>Provide optional RollingFileAppender for container log4j (syslog)</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5656">MAPREDUCE-5656</a>.
+     Critical bug reported by Jason Lowe and fixed by Jason Lowe <br>
+     <b>bzip2 codec can drop records when reading data in splits</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5650">MAPREDUCE-5650</a>.
+     Major bug reported by Gera Shegalov and fixed by Gera Shegalov (mrv2)<br>
+     <b>Job fails when hprof mapreduce.task.profile.map/reduce.params is specified</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5645">MAPREDUCE-5645</a>.
+     Major bug reported by Jonathan Eagles and fixed by Mit Desai <br>
+     <b>TestFixedLengthInputFormat fails with native libs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5640">MAPREDUCE-5640</a>.
+     Trivial improvement reported by Jason Lowe and fixed by Jason Lowe (test)<br>
+     <b>Rename TestLineRecordReader in jobclient module</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5632">MAPREDUCE-5632</a>.
+     Major test reported by Ted Yu and fixed by Jonathan Eagles <br>
+     <b>TestRMContainerAllocator#testUpdatedNodes fails</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5631">MAPREDUCE-5631</a>.
+     Major bug reported by Jonathan Eagles and fixed by Jonathan Eagles <br>
+     <b>TestJobEndNotifier.testNotifyRetries fails with Should have taken more than 5 seconds in jdk7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5625">MAPREDUCE-5625</a>.
+     Major test reported by Jonathan Eagles and fixed by Mariappan Asokan <br>
+     <b>TestFixedLengthInputFormat fails in jdk7 environment</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5623">MAPREDUCE-5623</a>.
+     Major bug reported by Tsuyoshi OZAWA and fixed by Jason Lowe <br>
+     <b>TestJobCleanup fails because of RejectedExecutionException and NPE.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5616">MAPREDUCE-5616</a>.
+     Major bug reported by Chris Nauroth and fixed by Chris Nauroth (client)<br>
+     <b>MR Client-AppMaster RPC max retries on socket timeout is too high.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5613">MAPREDUCE-5613</a>.
+     Major bug reported by Gera Shegalov and fixed by Gera Shegalov (applicationmaster)<br>
+     <b>DefaultSpeculator holds and checks hashmap that is always empty</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5610">MAPREDUCE-5610</a>.
+     Major test reported by Jonathan Eagles and fixed by Jonathan Eagles <br>
+     <b>TestSleepJob fails in jdk7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5604">MAPREDUCE-5604</a>.
+     Minor bug reported by Chris Nauroth and fixed by Chris Nauroth (test)<br>
+     <b>TestMRAMWithNonNormalizedCapabilities fails on Windows due to exceeding max path length</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5601">MAPREDUCE-5601</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>ShuffleHandler fadvises file regions as DONTNEED even when fetch fails</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5598">MAPREDUCE-5598</a>.
+     Major bug reported by Robert Kanter and fixed by Robert Kanter (test)<br>
+     <b>TestUserDefinedCounters.testMapReduceJob is flakey</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5596">MAPREDUCE-5596</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>Allow configuring the number of threads used to serve shuffle connections</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5587">MAPREDUCE-5587</a>.
+     Major bug reported by Jonathan Eagles and fixed by Jonathan Eagles <br>
+     <b>TestTextOutputFormat fails on JDK7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5586">MAPREDUCE-5586</a>.
+     Major bug reported by Jonathan Eagles and fixed by Jonathan Eagles <br>
+     <b>TestCopyMapper#testCopyFailOnBlockSizeDifference fails when run from hadoop-tools/hadoop-distcp directory</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5585">MAPREDUCE-5585</a>.
+     Major bug reported by Jonathan Eagles and fixed by Jonathan Eagles <br>
+     <b>TestCopyCommitter#testNoCommitAction Fails on JDK7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5569">MAPREDUCE-5569</a>.
+     Major bug reported by Nathan Roberts and fixed by Nathan Roberts <br>
+     <b>FloatSplitter is not generating correct splits</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5561">MAPREDUCE-5561</a>.
+     Critical bug reported by Cindy Li and fixed by Karthik Kambatla <br>
+     <b>org.apache.hadoop.mapreduce.v2.app.job.impl.TestJobImpl testcase failing on trunk</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5550">MAPREDUCE-5550</a>.
+     Major bug reported by Vrushali C and fixed by Gera Shegalov <br>
+     <b>Task Status message (reporter.setStatus) not shown in UI with Hadoop 2.0</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5546">MAPREDUCE-5546</a>.
+     Major bug reported by Chuan Liu and fixed by Chuan Liu <br>
+     <b>mapred.cmd on Windows set HADOOP_OPTS incorrectly</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5522">MAPREDUCE-5522</a>.
+     Minor bug reported by Jinghui Wang and fixed by Jinghui Wang (test)<br>
+     <b>Incorrectly expect the array of JobQueueInfo returned by o.a.h.mapred.QueueManager#getJobQueueInfos to have a specific order.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5518">MAPREDUCE-5518</a>.
+     Trivial bug reported by Albert Chu and fixed by Albert Chu (examples)<br>
+     <b>Fix typo "can't read paritions file"</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5514">MAPREDUCE-5514</a>.
+     Blocker bug reported by Zhijie Shen and fixed by Zhijie Shen <br>
+     <b>TestRMContainerAllocator fails on trunk</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5504">MAPREDUCE-5504</a>.
+     Major bug reported by Thomas Graves and fixed by Kousuke Saruta (client)<br>
+     <b>mapred queue -info inconsistent with types</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5487">MAPREDUCE-5487</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza (performance , task)<br>
+     <b>In task processes, JobConf is unnecessarily loaded again in Limits</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5484">MAPREDUCE-5484</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza (task)<br>
+     <b>YarnChild unnecessarily loads job conf twice</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5481">MAPREDUCE-5481</a>.
+     Blocker bug reported by Jason Lowe and fixed by Sandy Ryza (mrv2 , test)<br>
+     <b>Enable uber jobs to have multiple reducers </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5464">MAPREDUCE-5464</a>.
+     Major task reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>Add analogs of the SLOTS_MILLIS counters that jive with the YARN resource model</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5463">MAPREDUCE-5463</a>.
+     Major task reported by Sandy Ryza and fixed by Tsuyoshi OZAWA <br>
+     <b>Deprecate SLOTS_MILLIS counters</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5457">MAPREDUCE-5457</a>.
+     Major improvement reported by Sandy Ryza and fixed by Sandy Ryza <br>
+     <b>Add a KeyOnlyTextOutputReader to enable streaming to write out text files without separators</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5451">MAPREDUCE-5451</a>.
+     Major bug reported by Mostafa Elhemali and fixed by Yingda Chen <br>
+     <b>MR uses LD_LIBRARY_PATH which doesn't mean anything in Windows</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5431">MAPREDUCE-5431</a>.
+     Major bug reported by Timothy St. Clair and fixed by Timothy St. Clair (build)<br>
+     <b>Missing pom dependency in MR-client</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5411">MAPREDUCE-5411</a>.
+     Major sub-task reported by Ashwin Shankar and fixed by Ashwin Shankar (jobhistoryserver)<br>
+     <b>Refresh size of loaded job cache on history server</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5409">MAPREDUCE-5409</a>.
+     Major sub-task reported by Devaraj K and fixed by Gera Shegalov <br>
+     <b>MRAppMaster throws InvalidStateTransitonException: Invalid event: TA_TOO_MANY_FETCH_FAILURE at KILLED for TaskAttemptImpl</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5404">MAPREDUCE-5404</a>.
+     Major bug reported by Ted Yu and fixed by Ted Yu (jobhistoryserver)<br>
+     <b>HSAdminServer does not use ephemeral ports in minicluster mode</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5386">MAPREDUCE-5386</a>.
+     Major sub-task reported by Ashwin Shankar and fixed by Ashwin Shankar (jobhistoryserver)<br>
+     <b>Ability to refresh history server job retention and job cleaner settings</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5380">MAPREDUCE-5380</a>.
+     Major bug reported by Stephen Chu and fixed by Stephen Chu <br>
+     <b>Invalid mapred command should return non-zero exit code</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5373">MAPREDUCE-5373</a>.
+     Major bug reported by Chuan Liu and fixed by Jonathan Eagles <br>
+     <b>TestFetchFailure.testFetchFailureMultipleReduces could fail intermittently</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5356">MAPREDUCE-5356</a>.
+     Major sub-task reported by Ashwin Shankar and fixed by Ashwin Shankar (jobhistoryserver)<br>
+     <b>Ability to refresh aggregated log retention period and check interval </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5332">MAPREDUCE-5332</a>.
+     Major new feature reported by Jason Lowe and fixed by Jason Lowe (jobhistoryserver)<br>
+     <b>Support token-preserving restart of history server</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5329">MAPREDUCE-5329</a>.
+     Major bug reported by Avner BenHanoch and fixed by Avner BenHanoch (mr-am)<br>
+     <b>APPLICATION_INIT is never sent to AuxServices other than the builtin ShuffleHandler</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5316">MAPREDUCE-5316</a>.
+     Major bug reported by Ashwin Shankar and fixed by Ashwin Shankar (client)<br>
+     <b>job -list-attempt-ids command does not handle illegal task-state</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5266">MAPREDUCE-5266</a>.
+     Major new feature reported by Jason Lowe and fixed by Ashwin Shankar (jobhistoryserver)<br>
+     <b>Ability to refresh retention settings on history server</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5265">MAPREDUCE-5265</a>.
+     Major new feature reported by Jason Lowe and fixed by Ashwin Shankar (jobhistoryserver)<br>
+     <b>History server admin service to refresh user and superuser group mappings</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5186">MAPREDUCE-5186</a>.
+     Critical bug reported by Sangjin Lee and fixed by Robert Parker (job submission)<br>
+     <b>mapreduce.job.max.split.locations causes some splits created by CombineFileInputFormat to fail</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5102">MAPREDUCE-5102</a>.
+     Major test reported by Aleksey Gorshkov and fixed by Andrey Klochkov <br>
+     <b>fix coverage  org.apache.hadoop.mapreduce.lib.db and org.apache.hadoop.mapred.lib.db</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5084">MAPREDUCE-5084</a>.
+     Major test reported by Aleksey Gorshkov and fixed by Aleksey Gorshkov <br>
+     <b>fix coverage  org.apache.hadoop.mapreduce.v2.app.webapp and org.apache.hadoop.mapreduce.v2.hs.webapp</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5052">MAPREDUCE-5052</a>.
+     Critical bug reported by Kendall Thrapp and fixed by Chen He (jobhistoryserver , webapps)<br>
+     <b>Job History UI and web services confusing job start time and job submit time</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-5020">MAPREDUCE-5020</a>.
+     Major bug reported by Trevor Robinson and fixed by Trevor Robinson (client)<br>
+     <b>Compile failure with JDK8</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-4680">MAPREDUCE-4680</a>.
+     Major bug reported by Sandy Ryza and fixed by Robert Kanter (jobhistoryserver)<br>
+     <b>Job history cleaner should only check timestamps of files in old enough directories</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-4421">MAPREDUCE-4421</a>.
+     Major improvement reported by Arun C Murthy and fixed by Jason Lowe <br>
+     <b>Run MapReduce framework via the distributed cache</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-3310">MAPREDUCE-3310</a>.
+     Major improvement reported by Mathias Herberts and fixed by Alejandro Abdelnur (client)<br>
+     <b>Custom grouping comparator cannot be set for Combiners</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-1176">MAPREDUCE-1176</a>.
+     Major new feature reported by BitsOfInfo and fixed by Mariappan Asokan <br>
+     <b>FixedLengthInputFormat and FixedLengthRecordReader</b><br>
+     <blockquote>Addition of FixedLengthInputFormat and FixedLengthRecordReader in the org.apache.hadoop.mapreduce.lib.input package. These two classes can be used when you need to read data from files containing fixed-length (fixed-width) records. Such files have no CR/LF and no delimiters; each record is a fixed length, extra data is padded with spaces, and the data forms one gigantic line within the file. A job that uses this input format must set the "mapreduce.input.fixedlengthinputformat.record.length" property, for example: myJobConf.setInt("mapreduce.input.fixedlengthinputformat.record.length",[myFixedRecordLength]); A fuller job setup is sketched below.
+
+Please see the javadoc for more details.</blockquote></li>
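+     <p>A minimal illustrative sketch of wiring FixedLengthInputFormat into a job, assuming a hypothetical 80-byte record width and hypothetical input/output paths; only the property name comes from the release note above.</p>
+<pre>
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+public class FixedLengthJobSketch {
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    // Every record in the input is exactly 80 bytes (hypothetical width).
+    conf.setInt("mapreduce.input.fixedlengthinputformat.record.length", 80);
+
+    Job job = Job.getInstance(conf, "fixed-length-read");
+    job.setJarByClass(FixedLengthJobSketch.class);
+    job.setInputFormatClass(FixedLengthInputFormat.class);
+    // Hypothetical paths; the default (identity) mapper simply copies records.
+    FileInputFormat.addInputPath(job, new Path("/data/fixed-width"));
+    FileOutputFormat.setOutputPath(job, new Path("/data/fixed-width-out"));
+    System.exit(job.waitForCompletion(true) ? 0 : 1);
+  }
+}
+</pre>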
+<li> <a href="https://issues.apache.org/jira/browse/MAPREDUCE-434">MAPREDUCE-434</a>.
+     Minor improvement reported by Yoram Arnon and fixed by Aaron Kimball <br>
+     <b>LocalJobRunner limited to single reducer</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5921">HDFS-5921</a>.
+     Critical bug reported by Aaron T. Myers and fixed by Aaron T. Myers (namenode)<br>
+     <b>Cannot browse file system via NN web UI if any directory has the sticky bit set</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5876">HDFS-5876</a>.
+     Major bug reported by Haohui Mai and fixed by Haohui Mai (datanode)<br>
+     <b>SecureDataNodeStarter does not pick up configuration in hdfs-site.xml</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5873">HDFS-5873</a>.
+     Major bug reported by Yesha Vora and fixed by Haohui Mai <br>
+     <b>dfs.http.policy should have higher precedence over dfs.https.enable</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5845">HDFS-5845</a>.
+     Blocker bug reported by Andrew Wang and fixed by Andrew Wang (namenode)<br>
+     <b>SecondaryNameNode dies when checkpointing with cache pools</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5844">HDFS-5844</a>.
+     Minor bug reported by Akira AJISAKA and fixed by Akira AJISAKA (documentation)<br>
+     <b>Fix broken link in WebHDFS.apt.vm</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5842">HDFS-5842</a>.
+     Major bug reported by Arpit Gupta and fixed by Jing Zhao (security)<br>
+     <b>Cannot create hftp filesystem when using a proxy user ugi and a doAs on a secure cluster</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5841">HDFS-5841</a>.
+     Major improvement reported by Andrew Wang and fixed by Andrew Wang <br>
+     <b>Update HDFS caching documentation with new changes</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5837">HDFS-5837</a>.
+     Major bug reported by Bryan Beaudreault and fixed by Tao Luo (namenode)<br>
+     <b>dfs.namenode.replication.considerLoad does not consider decommissioned nodes</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5833">HDFS-5833</a>.
+     Trivial improvement reported by Bangtao Zhou and fixed by  (namenode)<br>
+     <b>SecondaryNameNode have an incorrect java doc</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5830">HDFS-5830</a>.
+     Blocker bug reported by Yongjun Zhang and fixed by Yongjun Zhang (caching , hdfs-client)<br>
+     <b>WebHdfsFileSystem.getFileBlockLocations throws IllegalArgumentException when accessing another cluster. </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5825">HDFS-5825</a>.
+     Minor improvement reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Use FileUtils.copyFile() to implement DFSTestUtils.copyFile()</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5806">HDFS-5806</a>.
+     Major bug reported by Nathan Roberts and fixed by Nathan Roberts (balancer)<br>
+     <b>balancer should set SoTimeout to avoid indefinite hangs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5800">HDFS-5800</a>.
+     Trivial bug reported by Kousuke Saruta and fixed by Kousuke Saruta (hdfs-client)<br>
+     <b>Typo: soft-limit for hard-limit in DFSClient</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5789">HDFS-5789</a>.
+     Major bug reported by Uma Maheswara Rao G and fixed by Uma Maheswara Rao G (namenode)<br>
+     <b>Some of snapshot APIs missing checkOperation double check in fsn</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5788">HDFS-5788</a>.
+     Major improvement reported by Nathan Roberts and fixed by Nathan Roberts (namenode)<br>
+     <b>listLocatedStatus response can be very large</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5784">HDFS-5784</a>.
+     Major sub-task reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (namenode)<br>
+     <b>reserve space in edit log header and fsimage header for feature flag section</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5777">HDFS-5777</a>.
+     Major bug reported by Jing Zhao and fixed by Jing Zhao (namenode)<br>
+     <b>Update LayoutVersion for the new editlog op OP_ADD_BLOCK</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5766">HDFS-5766</a>.
+     Major bug reported by Liang Xie and fixed by Liang Xie (hdfs-client)<br>
+     <b>In DFSInputStream, do not add datanode to deadNodes after InvalidEncryptionKeyException in fetchBlockByteRange</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5762">HDFS-5762</a>.
+     Major bug reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe <br>
+     <b>BlockReaderLocal doesn't return -1 on EOF when doing zero-length reads</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5756">HDFS-5756</a>.
+     Major bug reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (libhdfs)<br>
+     <b>hadoopRzOptionsSetByteBufferPool does not accept NULL argument, contrary to docs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5748">HDFS-5748</a>.
+     Major improvement reported by Kihwal Lee and fixed by Haohui Mai <br>
+     <b>Too much information shown in the dfs health page.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5747">HDFS-5747</a>.
+     Minor bug reported by Tsz Wo (Nicholas), SZE and fixed by Arpit Agarwal (namenode)<br>
+     <b>BlocksMap.getStoredBlock(..) and BlockInfoUnderConstruction.addReplicaIfNotPresent(..) may throw NullPointerException</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5728">HDFS-5728</a>.
+     Critical bug reported by Vinayakumar B and fixed by Vinayakumar B (datanode)<br>
+     <b>[Diskfull] Block recovery will fail if the metafile does not have crc for all chunks of the block</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5721">HDFS-5721</a>.
+     Minor improvement reported by Ted Yu and fixed by Ted Yu <br>
+     <b>sharedEditsImage in Namenode#initializeSharedEdits() should be closed before method returns</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5719">HDFS-5719</a>.
+     Minor bug reported by Ted Yu and fixed by Ted Yu (namenode)<br>
+     <b>FSImage#doRollback() should close prevState before return</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5710">HDFS-5710</a>.
+     Major bug reported by Ted Yu and fixed by Uma Maheswara Rao G <br>
+     <b>FSDirectory#getFullPathName should check inodes against null</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5704">HDFS-5704</a>.
+     Major bug reported by Suresh Srinivas and fixed by Jing Zhao (namenode)<br>
+     <b>Change OP_UPDATE_BLOCKS  with a new OP_ADD_BLOCK</b><br>
+     <blockquote>Add a new editlog record (OP_ADD_BLOCK) that, on every block allocation, records only the newly allocated block instead of the entire block list.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5703">HDFS-5703</a>.
+     Major new feature reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (webhdfs)<br>
+     <b>Add support for HTTPS and swebhdfs to HttpFS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5695">HDFS-5695</a>.
+     Major improvement reported by Haohui Mai and fixed by Haohui Mai (test)<br>
+     <b>Clean up TestOfflineEditsViewer and OfflineEditsViewerHelper</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5691">HDFS-5691</a>.
+     Minor bug reported by Akira AJISAKA and fixed by Akira AJISAKA (documentation)<br>
+     <b>Fix typo in ShortCircuitLocalRead document</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5690">HDFS-5690</a>.
+     Blocker bug reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>DataNode fails to start in secure mode when dfs.http.policy equals to HTTP_ONLY</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5681">HDFS-5681</a>.
+     Major bug reported by Daryn Sharp and fixed by Daryn Sharp (namenode)<br>
+     <b>renewLease should not hold fsn write lock</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5677">HDFS-5677</a>.
+     Minor improvement reported by Vincent Sheffer and fixed by Vincent Sheffer (datanode , ha)<br>
+     <b>Need error checking for HA cluster configuration</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5676">HDFS-5676</a>.
+     Minor improvement reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (hdfs-client)<br>
+     <b>fix inconsistent synchronization of CachingStrategy</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5675">HDFS-5675</a>.
+     Minor bug reported by Plamen Jeliazkov and fixed by Plamen Jeliazkov (benchmarks)<br>
+     <b>Add Mkdirs operation to NNThroughputBenchmark</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5674">HDFS-5674</a>.
+     Minor improvement reported by Tsz Wo (Nicholas), SZE and fixed by Tsz Wo (Nicholas), SZE (namenode)<br>
+     <b>Editlog code cleanup</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5671">HDFS-5671</a>.
+     Critical bug reported by JamesLi and fixed by JamesLi (hdfs-client)<br>
+     <b>Fix socket leak in DFSInputStream#getBlockReader</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5667">HDFS-5667</a>.
+     Major sub-task reported by Eric Sirianni and fixed by Arpit Agarwal (datanode)<br>
+     <b>Include DatanodeStorage in StorageReport</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5666">HDFS-5666</a>.
+     Minor bug reported by Colin Patrick McCabe and fixed by Jimmy Xiang (namenode)<br>
+     <b>Fix inconsistent synchronization in BPOfferService</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5663">HDFS-5663</a>.
+     Major improvement reported by Liang Xie and fixed by Liang Xie (hdfs-client)<br>
+     <b>make the retry time and interval value configurable in openInfo()</b><br>
+     <blockquote>Makes the number of retries, and the interval between retries, for getting the length of the last block of a file configurable. Below are the new configuration properties (a usage sketch follows this entry).
+
+dfs.client.retry.times.get-last-block-length
+dfs.client.retry.interval-ms.get-last-block-length
+
+They default to 3 retries and 4000 ms respectively, the values that were previously hardcoded.
+</blockquote></li>
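+     <p>A brief sketch of overriding these defaults from a client, assuming a plain client-side Configuration and a hypothetical file path; the property names and default values are the ones listed above.</p>
+<pre>
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class LastBlockLengthRetrySketch {
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    // Retry fetching the last block's length up to 5 times (default 3)...
+    conf.setInt("dfs.client.retry.times.get-last-block-length", 5);
+    // ...waiting 2000 ms between attempts (default 4000 ms).
+    conf.setInt("dfs.client.retry.interval-ms.get-last-block-length", 2000);
+
+    // Streams opened through this FileSystem pick up the overridden values.
+    FileSystem fs = FileSystem.get(conf);
+    fs.open(new Path("/tmp/some-open-file")).close();  // hypothetical path
+  }
+}
+</pre>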
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5662">HDFS-5662</a>.
+     Major improvement reported by Brandon Li and fixed by Brandon Li (namenode)<br>
+     <b>Can't decommission a DataNode due to file's replication factor larger than the rest of the cluster size</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5661">HDFS-5661</a>.
+     Major bug reported by Benoy Antony and fixed by Benoy Antony <br>
+     <b>Browsing FileSystem via web ui, should use datanode's fqdn instead of ip address</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5657">HDFS-5657</a>.
+     Major bug reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>race condition causes writeback state error in NFS gateway</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5652">HDFS-5652</a>.
+     Minor improvement reported by Liang Xie and fixed by Liang Xie (hdfs-client)<br>
+     <b>refactoring/uniforming invalid block token exception handling in DFSInputStream</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5649">HDFS-5649</a>.
+     Major bug reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Unregister NFS and Mount service when NFS gateway is shutting down</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5637">HDFS-5637</a>.
+     Major improvement reported by Liang Xie and fixed by Liang Xie (hdfs-client , security)<br>
+     <b>try to refeatchToken while local read InvalidToken occurred</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5634">HDFS-5634</a>.
+     Major sub-task reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (hdfs-client)<br>
+     <b>allow BlockReaderLocal to switch between checksumming and not</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5633">HDFS-5633</a>.
+     Minor improvement reported by Jing Zhao and fixed by Jing Zhao <br>
+     <b>Improve OfflineImageViewer to use less memory</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5629">HDFS-5629</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Support HTTPS in JournalNode and SecondaryNameNode</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5592">HDFS-5592</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B <br>
+     <b>"DIR* completeFile: /file is closed by DFSClient_" should be logged only for successful closure of the file.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5590">HDFS-5590</a>.
+     Major bug reported by Jing Zhao and fixed by Jing Zhao <br>
+     <b>Block ID and generation stamp may be reused when persistBlocks is set to false</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5587">HDFS-5587</a>.
+     Minor improvement reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>add debug information when NFS fails to start with duplicate user or group names</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5582">HDFS-5582</a>.
+     Minor bug reported by Henry Hung and fixed by sathish <br>
+     <b>hdfs getconf -excludeFile or -includeFile always failed</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5581">HDFS-5581</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B (namenode)<br>
+     <b>NameNodeFsck should use only one instance of BlockPlacementPolicy</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5580">HDFS-5580</a>.
+     Major bug reported by Binglin Chang and fixed by Binglin Chang <br>
+     <b>Infinite loop in Balancer.waitForMoveCompletion</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5579">HDFS-5579</a>.
+     Major bug reported by zhaoyunjiong and fixed by zhaoyunjiong (namenode)<br>
+     <b>Under construction files make DataNode decommission take very long hours</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5577">HDFS-5577</a>.
+     Trivial improvement reported by Brandon Li and fixed by Brandon Li (documentation)<br>
+     <b>NFS user guide update</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5568">HDFS-5568</a>.
+     Major improvement reported by Vinayakumar B and fixed by Vinayakumar B (snapshots)<br>
+     <b>Support inclusion of snapshot paths in Namenode fsck</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5563">HDFS-5563</a>.
+     Major improvement reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>NFS gateway should commit the buffered data when read request comes after write to the same file</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5561">HDFS-5561</a>.
+     Minor improvement reported by Fengdong Yu and fixed by Haohui Mai (namenode)<br>
+     <b>FSNameSystem#getNameJournalStatus() in JMX should return plain text instead of HTML</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5560">HDFS-5560</a>.
+     Major bug reported by Josh Elser and fixed by Josh Elser <br>
+     <b>Trash configuration log statements prints incorrect units</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5558">HDFS-5558</a>.
+     Major bug reported by Kihwal Lee and fixed by Kihwal Lee <br>
+     <b>LeaseManager monitor thread can crash if the last block is complete but another block is not.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5557">HDFS-5557</a>.
+     Critical bug reported by Kihwal Lee and fixed by Kihwal Lee <br>
+     <b>Write pipeline recovery for the last packet in the block may cause rejection of valid replicas</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5552">HDFS-5552</a>.
+     Major bug reported by Shinichi Yamashita and fixed by Haohui Mai (namenode)<br>
+     <b>Fix wrong information of "Cluster summay" in dfshealth.html</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5548">HDFS-5548</a>.
+     Major improvement reported by Haohui Mai and fixed by Haohui Mai (nfs)<br>
+     <b>Use ConcurrentHashMap in portmap</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5545">HDFS-5545</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Allow specifying endpoints for listeners in HttpServer</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5544">HDFS-5544</a>.
+     Minor bug reported by sathish and fixed by sathish (hdfs-client)<br>
+     <b>Adding Test case For Checking dfs.checksum type as NULL value</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5540">HDFS-5540</a>.
+     Minor bug reported by Binglin Chang and fixed by Binglin Chang <br>
+     <b>Fix intermittent failure in TestBlocksWithNotEnoughRacks</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5538">HDFS-5538</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>URLConnectionFactory should pick up the SSL related configuration by default</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5536">HDFS-5536</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Implement HTTP policy for Namenode and DataNode</b><br>
+     <blockquote>Add a new HTTP policy configuration. Users can use "dfs.http.policy" to control the HTTP endpoints for the NameNode and DataNode. Specifically, the following values are supported:
+- HTTP_ONLY : service is provided only over HTTP
+- HTTPS_ONLY : service is provided only over HTTPS
+- HTTP_AND_HTTPS : service is provided over both HTTP and HTTPS
+
+hadoop.ssl.enabled and dfs.https.enable are deprecated. When these deprecated properties are still configured, the HTTP policy is currently decided by the following rules (see the sketch below):
+1. If dfs.http.policy is set to HTTPS_ONLY or HTTP_AND_HTTPS, the specified policy is picked; otherwise it proceeds to rules 2-4.
+2. HTTPS_ONLY is picked if hadoop.ssl.enabled is true.
+3. HTTP_AND_HTTPS is picked if dfs.https.enable is true.
+4. HTTP_ONLY is picked for all other configurations.</blockquote></li>
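+     <p>An illustrative sketch of the precedence rules above; the helper below simply mirrors rules 1-4 and is not the actual Hadoop resolver.</p>
+<pre>
+import org.apache.hadoop.conf.Configuration;
+
+public class HttpPolicySketch {
+  // Mirrors rules 1-4 from the note above (illustration only).
+  static String resolveHttpPolicy(Configuration conf) {
+    String policy = conf.get("dfs.http.policy");
+    if ("HTTPS_ONLY".equals(policy) || "HTTP_AND_HTTPS".equals(policy)) {
+      return policy;                                   // rule 1
+    }
+    if (conf.getBoolean("hadoop.ssl.enabled", false)) {
+      return "HTTPS_ONLY";                             // rule 2
+    }
+    if (conf.getBoolean("dfs.https.enable", false)) {
+      return "HTTP_AND_HTTPS";                         // rule 3
+    }
+    return "HTTP_ONLY";                                // rule 4
+  }
+
+  public static void main(String[] args) {
+    Configuration conf = new Configuration();
+    conf.setBoolean("dfs.https.enable", true);         // deprecated property
+    System.out.println(resolveHttpPolicy(conf));       // prints HTTP_AND_HTTPS
+  }
+}
+</pre>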
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5533">HDFS-5533</a>.
+     Minor bug reported by Binglin Chang and fixed by Binglin Chang (snapshots)<br>
+     <b>Symlink delete/create should be treated as DELETE/CREATE in snapshot diff report</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5532">HDFS-5532</a>.
+     Major improvement reported by Vinayakumar B and fixed by Vinayakumar B (webhdfs)<br>
+     <b>Enable the webhdfs by default to support new HDFS web UI</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5526">HDFS-5526</a>.
+     Blocker bug reported by Tsz Wo (Nicholas), SZE and fixed by Kihwal Lee (datanode)<br>
+     <b>Datanode cannot roll back to previous layout version</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5525">HDFS-5525</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Inline dust templates</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5519">HDFS-5519</a>.
+     Minor sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>COMMIT handler should update the commit status after sync</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5514">HDFS-5514</a>.
+     Major sub-task reported by Daryn Sharp and fixed by Daryn Sharp (namenode)<br>
+     <b>FSNamesystem's fsLock should allow custom implementation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5506">HDFS-5506</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Use URLConnectionFactory in DelegationTokenFetcher</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5504">HDFS-5504</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B (snapshots)<br>
+     <b>In HA mode, OP_DELETE_SNAPSHOT is not decrementing the safemode threshold, leads to NN safemode.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5502">HDFS-5502</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Fix HTTPS support in HsftpFileSystem</b><br>
+     <blockquote>Fix the HTTPS support in HsftpFileSystem. With the change, the client now verifies the server certificate. In particular, the client side verifies the Common Name of the certificate using a strategy specified by the configuration property "hadoop.ssl.hostname.verifier", as in the sketch below.</blockquote></li>
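+     <p>A minimal sketch of selecting the verification strategy on the client side; "DEFAULT" is assumed here as one supported value of the property, so treat the concrete value as an assumption and consult the SSL documentation of your release.</p>
+<pre>
+import org.apache.hadoop.conf.Configuration;
+
+public class HostnameVerifierSketch {
+  public static void main(String[] args) {
+    Configuration conf = new Configuration();
+    // Strategy used to verify the Common Name of the server certificate.
+    conf.set("hadoop.ssl.hostname.verifier", "DEFAULT");
+    // Pass this Configuration to the FileSystem client (e.g. HsftpFileSystem).
+  }
+}
+</pre>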
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5495">HDFS-5495</a>.
+     Major improvement reported by Andrew Wang and fixed by Jarek Jarcec Cecho <br>
+     <b>Remove further JUnit3 usages from HDFS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5489">HDFS-5489</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Use TokenAspect in WebHDFSFileSystem</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5488">HDFS-5488</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Clean up TestHftpURLTimeout</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5487">HDFS-5487</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Introduce unit test for TokenAspect</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5476">HDFS-5476</a>.
+     Major bug reported by Jing Zhao and fixed by Jing Zhao <br>
+     <b>Snapshot: clean the blocks/files/directories under a renamed file/directory while deletion</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5474">HDFS-5474</a>.
+     Major bug reported by Uma Maheswara Rao G and fixed by sathish (snapshots)<br>
+     <b>Deletesnapshot can make Namenode in safemode on NN restarts.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5469">HDFS-5469</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Add configuration property for the sub-directroy export path</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5467">HDFS-5467</a>.
+     Trivial improvement reported by Andrew Wang and fixed by Shinichi Yamashita <br>
+     <b>Remove tab characters in hdfs-default.xml</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5458">HDFS-5458</a>.
+     Major bug reported by Andrew Wang and fixed by Mike Mellenthin (datanode)<br>
+     <b>Datanode failed volume threshold ignored if exception is thrown in getDataDirsFromURIs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5456">HDFS-5456</a>.
+     Critical bug reported by Chris Nauroth and fixed by Chris Nauroth (namenode)<br>
+     <b>NameNode startup progress creates new steps if caller attempts to create a counter for a step that doesn't already exist.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5454">HDFS-5454</a>.
+     Minor sub-task reported by Eric Sirianni and fixed by Arpit Agarwal (datanode)<br>
+     <b>DataNode UUID should be assigned prior to FsDataset initialization</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5449">HDFS-5449</a>.
+     Blocker bug reported by Kihwal Lee and fixed by Kihwal Lee <br>
+     <b>WebHdfs compatibility broken between 2.2 and 1.x / 23.x</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5444">HDFS-5444</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Choose default web UI based on browser capabilities</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5443">HDFS-5443</a>.
+     Major bug reported by Uma Maheswara Rao G and fixed by Jing Zhao (snapshots)<br>
+     <b>Delete 0-sized block when deleting an under-construction file that is included in snapshot</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5440">HDFS-5440</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Extract the logic of handling delegation tokens in HftpFileSystem to the TokenAspect class</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5438">HDFS-5438</a>.
+     Critical bug reported by Kihwal Lee and fixed by Kihwal Lee (namenode)<br>
+     <b>Flaws in block report processing can cause data loss</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5436">HDFS-5436</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Move HsFtpFileSystem and HFtpFileSystem into org.apache.hdfs.web</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5434">HDFS-5434</a>.
+     Minor bug reported by Buddy and fixed by  (namenode)<br>
+     <b>Write resiliency for replica count 1</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5433">HDFS-5433</a>.
+     Critical bug reported by Aaron T. Myers and fixed by Aaron T. Myers (snapshots)<br>
+     <b>When reloading fsimage during checkpointing, we should clear existing snapshottable directories</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5432">HDFS-5432</a>.
+     Trivial bug reported by Chris Nauroth and fixed by Chris Nauroth (datanode , test)<br>
+     <b>TestDatanodeJsp fails on Windows due to assumption that loopback address resolves to host name localhost.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5428">HDFS-5428</a>.
+     Major bug reported by Vinayakumar B and fixed by Jing Zhao (snapshots)<br>
+     <b>under construction files deletion after snapshot+checkpoint+nn restart leads nn safemode</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5427">HDFS-5427</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B (snapshots)<br>
+     <b>not able to read deleted files from snapshot directly under snapshottable dir after checkpoint and NN restart</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5425">HDFS-5425</a>.
+     Major bug reported by sathish and fixed by Jing Zhao (namenode , snapshots)<br>
+     <b>Renaming underconstruction file with snapshots can make NN failure on restart</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5413">HDFS-5413</a>.
+     Major bug reported by Chris Nauroth and fixed by Chris Nauroth (scripts)<br>
+     <b>hdfs.cmd does not support passthrough to any arbitrary class.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5407">HDFS-5407</a>.
+     Trivial bug reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Fix typos in DFSClientCache</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5406">HDFS-5406</a>.
+     Major sub-task reported by Arpit Agarwal and fixed by Arpit Agarwal (datanode)<br>
+     <b>Send incremental block reports for all storages in a single call</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5403">HDFS-5403</a>.
+     Major bug reported by Aaron T. Myers and fixed by Aaron T. Myers (webhdfs)<br>
+     <b>WebHdfs client cannot communicate with older WebHdfs servers post HDFS-5306</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5400">HDFS-5400</a>.
+     Major bug reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe <br>
+     <b>DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT constant is set to the wrong value</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5399">HDFS-5399</a>.
+     Major improvement reported by Jing Zhao and fixed by Jing Zhao <br>
+     <b>Revisit SafeModeException and corresponding retry policies</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5393">HDFS-5393</a>.
+     Minor sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Serve bootstrap and jQuery locally</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5382">HDFS-5382</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Implement the UI of browsing filesystems in HTML 5 page</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5379">HDFS-5379</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Update links to datanode information in dfshealth.html</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5375">HDFS-5375</a>.
+     Minor bug reported by Chris Nauroth and fixed by Chris Nauroth (tools)<br>
+     <b>hdfs.cmd does not expose several snapshot commands.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5374">HDFS-5374</a>.
+     Trivial bug reported by Suresh Srinivas and fixed by Suresh Srinivas <br>
+     <b>Remove deadcode in DFSOutputStream</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5372">HDFS-5372</a>.
+     Major bug reported by Tsz Wo (Nicholas), SZE and fixed by Vinayakumar B (namenode)<br>
+     <b>In FSNamesystem, hasReadLock() returns false if the current thread holds the write lock</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5371">HDFS-5371</a>.
+     Minor improvement reported by Jing Zhao and fixed by Jing Zhao (ha , test)<br>
+     <b>Let client retry the same NN when "dfs.client.test.drop.namenode.response.number" is enabled</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5370">HDFS-5370</a>.
+     Trivial bug reported by Kousuke Saruta and fixed by Kousuke Saruta (hdfs-client)<br>
+     <b>Typo in error message: difference between range in condition and range in error message</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5365">HDFS-5365</a>.
+     Major bug reported by Radim Kolar and fixed by Radim Kolar (build , libhdfs)<br>
+     <b>Fix libhdfs compile error on FreeBSD9</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5364">HDFS-5364</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Add OpenFileCtx cache</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5363">HDFS-5363</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Refactor WebHdfsFileSystem: move SPNEGO-authenticated connection creation to URLConnectionFactory</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5360">HDFS-5360</a>.
+     Minor improvement reported by Shinichi Yamashita and fixed by Shinichi Yamashita (snapshots)<br>
+     <b>Improvement of usage message of renameSnapshot and deleteSnapshot</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5353">HDFS-5353</a>.
+     Blocker bug reported by Haohui Mai and fixed by Colin Patrick McCabe <br>
+     <b>Short circuit reads fail when dfs.encrypt.data.transfer is enabled</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5352">HDFS-5352</a>.
+     Minor bug reported by Ted Yu and fixed by Ted Yu <br>
+     <b>Server#initLog() doesn't close InputStream in httpfs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5350">HDFS-5350</a>.
+     Minor improvement reported by Rob Weltman and fixed by Jimmy Xiang (namenode)<br>
+     <b>Name Node should report fsimage transfer time as a metric</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5347">HDFS-5347</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (documentation)<br>
+     <b>add HDFS NFS user guide</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5346">HDFS-5346</a>.
+     Major bug reported by Kihwal Lee and fixed by Ravi Prakash (namenode , performance)<br>
+     <b>Avoid unnecessary call to getNumLiveDataNodes() for each block during IBR processing</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5344">HDFS-5344</a>.
+     Minor improvement reported by sathish and fixed by sathish (snapshots , tools)<br>
+     <b>Make LsSnapshottableDir a Tool interface implementation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5343">HDFS-5343</a>.
+     Major bug reported by sathish and fixed by sathish (hdfs-client)<br>
+     <b>Unexpected result when the cat command is issued on snapshot files</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5342">HDFS-5342</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Provide more information in the FSNamesystem JMX interfaces</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5341">HDFS-5341</a>.
+     Major bug reported by qus-jiawei and fixed by qus-jiawei (datanode)<br>
+     <b>Reduce fsdataset lock duration during directory scanning.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5338">HDFS-5338</a>.
+     Major improvement reported by Tsz Wo (Nicholas), SZE and fixed by Tsz Wo (Nicholas), SZE (namenode)<br>
+     <b>Add a conf to disable hostname check in DN registration</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5337">HDFS-5337</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Should do hsync for a commit request even when there are no pending writes</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5336">HDFS-5336</a>.
+     Minor bug reported by Akira AJISAKA and fixed by Akira AJISAKA (namenode)<br>
+     <b>DataNode should not output 'StartupProgress' metrics</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5335">HDFS-5335</a>.
+     Major bug reported by Arpit Gupta and fixed by Haohui Mai <br>
+     <b>DFSOutputStream#close() keeps throwing exceptions when it is called multiple times</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5334">HDFS-5334</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Implement dfshealth.jsp in HTML pages</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5331">HDFS-5331</a>.
+     Major improvement reported by Vinayakumar B and fixed by Vinayakumar B (snapshots)<br>
+     <b>Make SnapshotDiff.java an o.a.h.util.Tool interface implementation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5330">HDFS-5330</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>fix readdir and readdirplus for large directories</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5329">HDFS-5329</a>.
+     Major bug reported by Brandon Li and fixed by Brandon Li (namenode , nfs)<br>
+     <b>Update FSNamesystem#getListing() to handle inode path in startAfter token</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5325">HDFS-5325</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Remove WebHdfsFileSystem#ConnRunner</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5323">HDFS-5323</a>.
+     Minor improvement reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (namenode)<br>
+     <b>Remove some dead code in BlockManager</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5322">HDFS-5322</a>.
+     Major bug reported by Arpit Gupta and fixed by Jing Zhao (ha)<br>
+     <b>HDFS delegation token not found in cache errors seen on secure HA clusters</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5317">HDFS-5317</a>.
+     Critical sub-task reported by Suresh Srinivas and fixed by Haohui Mai <br>
+     <b>Go back to DFS Home link does not work on datanode webUI</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5316">HDFS-5316</a>.
+     Critical sub-task reported by Suresh Srinivas and fixed by Haohui Mai <br>
+     <b>Namenode ignores the default https port</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5312">HDFS-5312</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Generate HTTP / HTTPS URL in DFSUtil#getInfoServer() based on the configured http policy</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5307">HDFS-5307</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Support both HTTP and HTTPS in jsp pages</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5305">HDFS-5305</a>.
+     Major bug reported by Suresh Srinivas and fixed by Suresh Srinivas <br>
+     <b>Add https support in HDFS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5297">HDFS-5297</a>.
+     Major bug reported by Akira AJISAKA and fixed by Akira AJISAKA (documentation)<br>
+     <b>Fix dead links in HDFS site documents</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5291">HDFS-5291</a>.
+     Critical bug reported by Arpit Gupta and fixed by Jing Zhao (ha)<br>
+     <b>Clients need to retry when Active NN is in SafeMode</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5288">HDFS-5288</a>.
+     Major sub-task reported by Haohui Mai and fixed by Haohui Mai (nfs)<br>
+     <b>Close idle connections in portmap</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5283">HDFS-5283</a>.
+     Critical bug reported by Vinayakumar B and fixed by Vinayakumar B (snapshots)<br>
+     <b>NN not coming out of startup safemode because under-construction blocks that exist only inside snapshots are also counted in the safemode threshold</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5281">HDFS-5281</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>COMMIT request should not block</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5276">HDFS-5276</a>.
+     Major bug reported by Chengxiang Li and fixed by Colin Patrick McCabe <br>
+     <b>FileSystem.Statistics has a performance issue on multi-threaded read/write.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5267">HDFS-5267</a>.
+     Minor improvement reported by Junping Du and fixed by Junping Du <br>
+     <b>Remove volatile from LightWeightHashSet</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5260">HDFS-5260</a>.
+     Major new feature reported by Chris Nauroth and fixed by Chris Nauroth (hdfs-client , libhdfs)<br>
+     <b>Merge zero-copy memory-mapped HDFS client reads to trunk and branch-2.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5257">HDFS-5257</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B (hdfs-client , namenode)<br>
+     <b>addBlock() retry should return LocatedBlock with locations else client will get AIOBE</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5252">HDFS-5252</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Stable write is not handled correctly in some places</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5240">HDFS-5240</a>.
+     Major sub-task reported by Daryn Sharp and fixed by Daryn Sharp (namenode)<br>
+     <b>Separate formatting from logging in the audit logger API</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5239">HDFS-5239</a>.
+     Major sub-task reported by Daryn Sharp and fixed by Daryn Sharp (namenode)<br>
+     <b>Allow FSNamesystem lock fairness to be configurable</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5220">HDFS-5220</a>.
+     Major improvement reported by Rob Weltman and fixed by Jimmy Xiang (namenode)<br>
+     <b>Expose group resolution time as metric</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5207">HDFS-5207</a>.
+     Major improvement reported by Junping Du and fixed by Junping Du (namenode)<br>
+     <b>In BlockPlacementPolicy, update 2 parameters of chooseTarget()</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5188">HDFS-5188</a>.
+     Major improvement reported by Tsz Wo (Nicholas), SZE and fixed by Tsz Wo (Nicholas), SZE (namenode)<br>
+     <b>Clean up BlockPlacementPolicy and its implementations</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5171">HDFS-5171</a>.
+     Major sub-task reported by Brandon Li and fixed by Haohui Mai (nfs)<br>
+     <b>NFS should create input stream for a file and try to share it with multiple read requests</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5170">HDFS-5170</a>.
+     Trivial bug reported by Andrew Wang and fixed by Andrew Wang <br>
+     <b>BlockPlacementPolicyDefault uses the wrong classname when alerting to enable debug logging</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5164">HDFS-5164</a>.
+     Minor bug reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (namenode)<br>
+     <b>deleteSnapshot should check if OperationCategory.WRITE is possible before taking write lock</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5144">HDFS-5144</a>.
+     Minor improvement reported by Akira AJISAKA and fixed by Akira AJISAKA (documentation)<br>
+     <b>Document time unit to NameNodeMetrics.java</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5136">HDFS-5136</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>MNT EXPORT should give the full list of groups that can mount the exports</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5130">HDFS-5130</a>.
+     Minor test reported by Binglin Chang and fixed by Binglin Chang (test)<br>
+     <b>Add test for snapshot related FsShell and DFSAdmin commands</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5122">HDFS-5122</a>.
+     Major bug reported by Arpit Gupta and fixed by Haohui Mai (ha , webhdfs)<br>
+     <b>Support failover and retry in WebHdfsFileSystem for NN HA</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5110">HDFS-5110</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Change FSDataOutputStream to HdfsDataOutputStream for opened streams to fix type cast error</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5107">HDFS-5107</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Fix array copy error in Readdir and Readdirplus responses</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5104">HDFS-5104</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Support dotdot name in NFS LOOKUP operation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5093">HDFS-5093</a>.
+     Minor bug reported by Chuan Liu and fixed by Chuan Liu (test)<br>
+     <b>TestGlobPaths should re-use the MiniDFSCluster to avoid failure on Windows</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5078">HDFS-5078</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Support file append in NFSv3 gateway to enable data streaming to HDFS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5075">HDFS-5075</a>.
+     Major bug reported by Timothy St. Clair and fixed by Timothy St. Clair <br>
+     <b>httpfs-config.sh calls out incorrect env script name</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5074">HDFS-5074</a>.
+     Major bug reported by Todd Lipcon and fixed by Todd Lipcon (ha , namenode)<br>
+     <b>Allow starting up from an fsimage checkpoint in the middle of a segment</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5073">HDFS-5073</a>.
+     Minor bug reported by Kihwal Lee and fixed by Arpit Agarwal (test)<br>
+     <b>TestListCorruptFileBlocks fails intermittently </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5071">HDFS-5071</a>.
+     Major sub-task reported by Kihwal Lee and fixed by Brandon Li (nfs)<br>
+     <b>Change hdfs-nfs parent project to hadoop-project</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5069">HDFS-5069</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Include hadoop-nfs and hadoop-hdfs-nfs into hadoop dist for NFS deployment</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5068">HDFS-5068</a>.
+     Major improvement reported by Konstantin Shvachko and fixed by Konstantin Shvachko (benchmarks)<br>
+     <b>Convert NNThroughputBenchmark to a Tool to allow generic options.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5065">HDFS-5065</a>.
+     Major bug reported by Ivan Mitic and fixed by Ivan Mitic (hdfs-client , test)<br>
+     <b>TestSymlinkHdfsDisable fails on Windows</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5043">HDFS-5043</a>.
+     Major bug reported by Brandon Li and fixed by Brandon Li <br>
+     <b>For HdfsFileStatus, set default value of childrenNum to -1 instead of 0 to avoid confusing applications</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5037">HDFS-5037</a>.
+     Critical improvement reported by Todd Lipcon and fixed by Andrew Wang (ha , namenode)<br>
+     <b>Active NN should trigger its own edit log rolls</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5035">HDFS-5035</a>.
+     Major bug reported by Andrew Wang and fixed by Andrew Wang (namenode)<br>
+     <b>getFileLinkStatus and rename do not correctly check permissions of symlinks</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5034">HDFS-5034</a>.
+     Trivial improvement reported by Andrew Wang and fixed by Andrew Wang (namenode)<br>
+     <b>Remove debug prints from getFileLinkInfo</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5023">HDFS-5023</a>.
+     Major bug reported by Ravi Prakash and fixed by Mit Desai (snapshots , test)<br>
+     <b>TestSnapshotPathINodes.testAllowSnapshot is failing with jdk7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5014">HDFS-5014</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B (datanode , ha)<br>
+     <b>BPOfferService#processCommandFromActor() synchronization on namenode RPC call delays IBR to Active NN, if Standby NN is unstable</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-5004">HDFS-5004</a>.
+     Major improvement reported by Trevor Lorimer and fixed by Trevor Lorimer (namenode)<br>
+     <b>Add additional JMX bean for NameNode status data</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4997">HDFS-4997</a>.
+     Major bug reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (libhdfs)<br>
+     <b>libhdfs doesn't return correct error codes in most cases</b><br>
+     <blockquote>libhdfs now returns correct codes in errno. Previously, due to a bug, many functions set errno to 255 instead of the more specific error code.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4995">HDFS-4995</a>.
+     Major bug reported by Kihwal Lee and fixed by Kihwal Lee (namenode)<br>
+     <b>Make getContentSummary() less expensive</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4994">HDFS-4994</a>.
+     Minor bug reported by Kihwal Lee and fixed by Robert Parker (namenode)<br>
+     <b>Audit log getContentSummary() calls</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4983">HDFS-4983</a>.
+     Major improvement reported by Harsh J and fixed by Yongjun Zhang (webhdfs)<br>
+     <b>Numeric usernames do not work with WebHDFS FS</b><br>
+     <blockquote>Add a new configuration property "dfs.webhdfs.user.provider.user.pattern" for specifying user name filters for WebHDFS.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4962">HDFS-4962</a>.
+     Minor sub-task reported by Tsz Wo (Nicholas), SZE and fixed by Tsz Wo (Nicholas), SZE (nfs)<br>
+     <b>Use enum for nfs constants</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4949">HDFS-4949</a>.
+     Major new feature reported by Andrew Wang and fixed by Andrew Wang (datanode , namenode)<br>
+     <b>Centralized cache management in HDFS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4948">HDFS-4948</a>.
+     Major bug reported by Robert Joseph Evans and fixed by Brandon Li <br>
+     <b>mvn site for hadoop-hdfs-nfs fails</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4947">HDFS-4947</a>.
+     Major sub-task reported by Brandon Li and fixed by Jing Zhao (nfs)<br>
+     <b>Add NFS server export table to control export by hostname or IP range</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4885">HDFS-4885</a>.
+     Major sub-task reported by Junping Du and fixed by Junping Du <br>
+     <b>Update verifyBlockPlacement() API in BlockPlacementPolicy</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4879">HDFS-4879</a>.
+     Major improvement reported by Todd Lipcon and fixed by Todd Lipcon (namenode)<br>
+     <b>Add "blocked ArrayList" collection to avoid CMS full GCs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4860">HDFS-4860</a>.
+     Major improvement reported by Trevor Lorimer and fixed by Trevor Lorimer (namenode)<br>
+     <b>Add additional attributes to JMX beans</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4816">HDFS-4816</a>.
+     Major bug reported by Andrew Wang and fixed by Andrew Wang (namenode)<br>
+     <b>transitionToActive blocks if the SBN is doing checkpoint image transfer</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4772">HDFS-4772</a>.
+     Minor improvement reported by Brandon Li and fixed by Brandon Li (namenode)<br>
+     <b>Add number of children in HdfsFileStatus</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4763">HDFS-4763</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Add script changes/utility for starting NFS gateway</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4762">HDFS-4762</a>.
+     Major sub-task reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>Provide HDFS based NFSv3 and Mountd implementation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4657">HDFS-4657</a>.
+     Major bug reported by Aaron T. Myers and fixed by Aaron T. Myers (namenode)<br>
+     <b>Limit the number of blocks logged by the NN after a block report to a configurable value.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4633">HDFS-4633</a>.
+     Major bug reported by Chris Nauroth and fixed by Chris Nauroth (hdfs-client , test)<br>
+     <b>TestDFSClientExcludedNodes fails sporadically if excluded nodes cache expires too quickly</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4517">HDFS-4517</a>.
+     Major test reported by Vadim Bondarev and fixed by Ivan A. Veselovsky <br>
+     <b>Cover class RemoteBlockReader with unit tests</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4516">HDFS-4516</a>.
+     Critical bug reported by Uma Maheswara Rao G and fixed by Vinayakumar B (namenode)<br>
+     <b>Client crash after block allocation and NN switch before lease recovery for the same file can cause readers to fail forever</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4512">HDFS-4512</a>.
+     Major test reported by Vadim Bondarev and fixed by Vadim Bondarev <br>
+     <b>Cover package org.apache.hadoop.hdfs.server.common with tests</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4511">HDFS-4511</a>.
+     Major test reported by Vadim Bondarev and fixed by Andrey Klochkov <br>
+     <b>Cover package org.apache.hadoop.hdfs.tools with unit test</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4510">HDFS-4510</a>.
+     Major test reported by Vadim Bondarev and fixed by Andrey Klochkov <br>
+     <b>Cover classes ClusterJspHelper/NamenodeJspHelper with unit tests</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4491">HDFS-4491</a>.
+     Major test reported by Tsuyoshi OZAWA and fixed by Andrey Klochkov (test)<br>
+     <b>Parallel testing HDFS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4376">HDFS-4376</a>.
+     Major bug reported by Aaron T. Myers and fixed by Junping Du (balancer)<br>
+     <b>Fix several race conditions in Balancer and resolve intermittent timeout of TestBalancerWithNodeGroup</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4329">HDFS-4329</a>.
+     Major bug reported by Andy Isaacson and fixed by Cristina L. Abad (hdfs-client)<br>
+     <b>DFSShell issues with directories with spaces in name</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4278">HDFS-4278</a>.
+     Major improvement reported by Harsh J and fixed by Kousuke Saruta (datanode , namenode)<br>
+     <b>Log an ERROR when DFS_BLOCK_ACCESS_TOKEN_ENABLE config is disabled but security is turned on.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4201">HDFS-4201</a>.
+     Critical bug reported by Eli Collins and fixed by Jimmy Xiang (namenode)<br>
+     <b>NPE in BPServiceActor#sendHeartBeat</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-4096">HDFS-4096</a>.
+     Major sub-task reported by Jing Zhao and fixed by Haohui Mai (datanode , namenode)<br>
+     <b>Add snapshot information to namenode WebUI</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-3987">HDFS-3987</a>.
+     Major sub-task reported by Alejandro Abdelnur and fixed by Haohui Mai <br>
+     <b>Support webhdfs over HTTPS</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-3981">HDFS-3981</a>.
+     Major bug reported by Xiaobo Peng and fixed by Xiaobo Peng (namenode)<br>
+     <b>access time is set without holding FSNamesystem write lock</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-3934">HDFS-3934</a>.
+     Minor bug reported by Andy Isaacson and fixed by Colin Patrick McCabe <br>
+     <b>Duplicate dfs_hosts entries are handled incorrectly</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HDFS-2933">HDFS-2933</a>.
+     Major improvement reported by Philip Zeyliger and fixed by Vivek Ganesan (datanode)<br>
+     <b>Improve DataNode Web UI Index Page</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10317">HADOOP-10317</a>.
+     Major bug reported by Andrew Wang and fixed by Andrew Wang <br>
+     <b>Rename branch-2.3 release version from 2.4.0-SNAPSHOT to 2.3.0-SNAPSHOT</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10313">HADOOP-10313</a>.
+     Major bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (build)<br>
+     <b>Script and jenkins job to produce Hadoop release artifacts</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10311">HADOOP-10311</a>.
+     Blocker bug reported by Suresh Srinivas and fixed by Alejandro Abdelnur <br>
+     <b>Cleanup vendor names from the code base</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10310">HADOOP-10310</a>.
+     Blocker bug reported by Aaron T. Myers and fixed by Aaron T. Myers (security)<br>
+     <b>SaslRpcServer should be initialized even when no secret manager present</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10305">HADOOP-10305</a>.
+     Major bug reported by Akira AJISAKA and fixed by Akira AJISAKA (metrics)<br>
+     <b>Add "rpc.metrics.quantile.enable" and "rpc.metrics.percentiles.intervals" to core-default.xml</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10292">HADOOP-10292</a>.
+     Major bug reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Restore HttpServer from branch-2.2 in branch-2</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10291">HADOOP-10291</a>.
+     Major bug reported by Mit Desai and fixed by Mit Desai <br>
+     <b>TestSecurityUtil#testSocketAddrWithIP fails</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10288">HADOOP-10288</a>.
+     Major bug reported by Todd Lipcon and fixed by Todd Lipcon (util)<br>
+     <b>Explicit reference to Log4JLogger breaks non-log4j users</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10274">HADOOP-10274</a>.
+     Minor improvement reported by takeshi.miao and fixed by takeshi.miao (security)<br>
+     <b>Lower the logging level from ERROR to WARN for UGI.doAs method</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10273">HADOOP-10273</a>.
+     Major bug reported by Arpit Agarwal and fixed by Arpit Agarwal (build)<br>
+     <b>Fix 'mvn site'</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10255">HADOOP-10255</a>.
+     Blocker bug reported by Haohui Mai and fixed by Haohui Mai <br>
+     <b>Rename HttpServer to HttpServer2 to retain older HttpServer in branch-2 for compatibility</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10252">HADOOP-10252</a>.
+     Major bug reported by Jimmy Xiang and fixed by Jimmy Xiang <br>
+     <b>HttpServer can't start if hostname is not specified</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10250">HADOOP-10250</a>.
+     Major bug reported by Yongjun Zhang and fixed by Yongjun Zhang <br>
+     <b>VersionUtil returns wrong value when comparing two versions</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10248">HADOOP-10248</a>.
+     Major improvement reported by Ted Yu and fixed by Akira AJISAKA <br>
+     <b>Property name should be included in the exception where property value is null</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10240">HADOOP-10240</a>.
+     Trivial bug reported by Chris Nauroth and fixed by Chris Nauroth (documentation)<br>
+     <b>Windows build instructions incorrectly state requirement of protoc 2.4.1 instead of 2.5.0</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10236">HADOOP-10236</a>.
+     Trivial bug reported by Akira AJISAKA and fixed by Akira AJISAKA <br>
+     <b>Fix typo in o.a.h.ipc.Client#checkResponse</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10235">HADOOP-10235</a>.
+     Major bug reported by Alejandro Abdelnur and fixed by Alejandro Abdelnur (build)<br>
+     <b>Hadoop tarball has 2 versions of stax-api JARs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10234">HADOOP-10234</a>.
+     Major bug reported by Chris Nauroth and fixed by Chris Nauroth (scripts)<br>
+     <b>"hadoop.cmd jar" does not propagate exit code.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10228">HADOOP-10228</a>.
+     Minor improvement reported by Haohui Mai and fixed by Haohui Mai (fs)<br>
+     <b>FsPermission#fromShort() should cache FsAction.values()</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10223">HADOOP-10223</a>.
+     Minor bug reported by Ted Yu and fixed by Ted Yu <br>
+     <b>MiniKdc#main() should close the FileReader it creates</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10214">HADOOP-10214</a>.
+     Major bug reported by Liang Xie and fixed by Liang Xie (ha)<br>
+     <b>Fix multithreaded correctness warnings in ActiveStandbyElector</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10212">HADOOP-10212</a>.
+     Major bug reported by Akira AJISAKA and fixed by Akira AJISAKA (documentation)<br>
+     <b>Incorrect compile command in Native Library document</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10208">HADOOP-10208</a>.
+     Trivial improvement reported by Benoy Antony and fixed by Benoy Antony <br>
+     <b>Remove duplicate initialization in StringUtils.getStringCollection</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10207">HADOOP-10207</a>.
+     Minor test reported by Jimmy Xiang and fixed by Jimmy Xiang <br>
+     <b>TestUserGroupInformation#testLogin is flaky</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10203">HADOOP-10203</a>.
+     Major bug reported by Andrei Savu and fixed by Andrei Savu (fs/s3)<br>
+     <b>Connection leak in Jets3tNativeFileSystemStore#retrieveMetadata </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10198">HADOOP-10198</a>.
+     Minor improvement reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (native)<br>
+     <b>DomainSocket: add support for socketpair</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10193">HADOOP-10193</a>.
+     Minor bug reported by Gregory Chanan and fixed by Gregory Chanan (security)<br>
+     <b>hadoop-auth's PseudoAuthenticationHandler can consume getInputStream</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10178">HADOOP-10178</a>.
+     Major bug reported by shanyu zhao and fixed by shanyu zhao (conf)<br>
+     <b>Configuration deprecation always emits "deprecated" warnings when a new key is used</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10175">HADOOP-10175</a>.
+     Major bug reported by Chuan Liu and fixed by Chuan Liu (fs)<br>
+     <b>Har file system authority should preserve userinfo</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10173">HADOOP-10173</a>.
+     Critical improvement reported by Daryn Sharp and fixed by Daryn Sharp (ipc)<br>
+     <b>Remove UGI from DIGEST-MD5 SASL server creation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10172">HADOOP-10172</a>.
+     Critical improvement reported by Daryn Sharp and fixed by Daryn Sharp (ipc)<br>
+     <b>Cache SASL server factories</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10171">HADOOP-10171</a>.
+     Major bug reported by Mit Desai and fixed by Mit Desai <br>
+     <b>TestRPC fails intermittently on jdk7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10169">HADOOP-10169</a>.
+     Minor improvement reported by Liang Xie and fixed by Liang Xie (metrics)<br>
+     <b>Remove the unnecessary synchronized in JvmMetrics class</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10168">HADOOP-10168</a>.
+     Major bug reported by Thejas M Nair and fixed by Thejas M Nair <br>
+     <b>fix javadoc of ReflectionUtils.copy </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10167">HADOOP-10167</a>.
+     Major improvement reported by Mikhail Antonov and fixed by  (build)<br>
+     <b>Mark hadoop-common source as UTF-8 in Maven pom files / refactoring</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10164">HADOOP-10164</a>.
+     Major improvement reported by Robert Joseph Evans and fixed by Robert Joseph Evans <br>
+     <b>Allow UGI to login with a known Subject</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10162">HADOOP-10162</a>.
+     Major bug reported by Mit Desai and fixed by Mit Desai <br>
+     <b>Fix symlink-related test failures in TestFileContextResolveAfs and TestStat in branch-2</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10147">HADOOP-10147</a>.
+     Minor bug reported by Eric Sirianni and fixed by Steve Loughran (build)<br>
+     <b>Upgrade to commons-logging 1.1.3 to avoid potential deadlock in MiniDFSCluster</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10146">HADOOP-10146</a>.
+     Critical bug reported by Daryn Sharp and fixed by Daryn Sharp (util)<br>
+     <b>Workaround JDK7 Process fd close bug</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10143">HADOOP-10143</a>.
+     Major improvement reported by Liang Xie and fixed by Liang Xie (io)<br>
+     <b>replace WritableFactories's hashmap with ConcurrentHashMap</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10142">HADOOP-10142</a>.
+     Major bug reported by Vinayakumar B and fixed by Vinayakumar B <br>
+     <b>Avoid groups lookup for unprivileged users such as "dr.who"</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10135">HADOOP-10135</a>.
+     Major bug reported by David Dobbins and fixed by David Dobbins (fs)<br>
+     <b>Writes to swift fs over partition size leave temp files and an empty output file</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10132">HADOOP-10132</a>.
+     Minor improvement reported by Ted Yu and fixed by Ted Yu <br>
+     <b>RPC#stopProxy() should log the class of proxy when IllegalArgumentException is encountered</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10130">HADOOP-10130</a>.
+     Minor bug reported by Binglin Chang and fixed by Binglin Chang <br>
+     <b>RawLocalFS::LocalFSFileInputStream.pread does not track FS::Statistics</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10129">HADOOP-10129</a>.
+     Critical bug reported by Daryn Sharp and fixed by Daryn Sharp (tools/distcp)<br>
+     <b>Distcp may succeed when it fails</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10127">HADOOP-10127</a>.
+     Major bug reported by Karthik Kambatla and fixed by Karthik Kambatla (ipc)<br>
+     <b>Add ipc.client.connect.retry.interval to control the frequency of connection retries</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10126">HADOOP-10126</a>.
+     Minor bug reported by Vinayakumar B and fixed by Vinayakumar B (util)<br>
+     <b>LightWeightGSet log message is confusing: "2.0% max memory = 2.0 GB"</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10125">HADOOP-10125</a>.
+     Major bug reported by Ming Ma and fixed by Ming Ma (ipc)<br>
+     <b>no need to process RPC request if the client connection has been dropped</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10112">HADOOP-10112</a>.
+     Major bug reported by Brandon Li and fixed by Brandon Li (tools)<br>
+     <b>Har file listing doesn't work with wildcards</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10111">HADOOP-10111</a>.
+     Major improvement reported by Kihwal Lee and fixed by Kihwal Lee <br>
+     <b>Allow DU to be initialized with an initial value</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10110">HADOOP-10110</a>.
+     Blocker bug reported by Chuan Liu and fixed by Chuan Liu (build)<br>
+     <b>hadoop-auth has a build break due to missing dependency</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10109">HADOOP-10109</a>.
+     Major sub-task reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe (test)<br>
+     <b>Fix test failure in TestOfflineEditsViewer introduced by HADOOP-10052</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10107">HADOOP-10107</a>.
+     Major sub-task reported by Tsz Wo (Nicholas), SZE and fixed by Kihwal Lee (ipc)<br>
+     <b>Server.getNumOpenConnections may throw NPE</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10106">HADOOP-10106</a>.
+     Minor bug reported by Ming Ma and fixed by Ming Ma <br>
+     <b>Incorrect thread name in RPC log messages</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10103">HADOOP-10103</a>.
+     Minor sub-task reported by Steve Loughran and fixed by Akira AJISAKA (build)<br>
+     <b>update commons-lang to 2.6</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10102">HADOOP-10102</a>.
+     Minor sub-task reported by Steve Loughran and fixed by Akira AJISAKA (build)<br>
+     <b>update commons IO from 2.1 to 2.4</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10100">HADOOP-10100</a>.
+     Major bug reported by Robert Kanter and fixed by Robert Kanter <br>
+     <b>MiniKDC shouldn't use apacheds-all artifact</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10095">HADOOP-10095</a>.
+     Minor improvement reported by Nicolas Liochon and fixed by Nicolas Liochon (io)<br>
+     <b>Performance improvement in CodecPool</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10094">HADOOP-10094</a>.
+     Trivial bug reported by Enis Soztutar and fixed by Enis Soztutar (util)<br>
+     <b>NPE in GenericOptionsParser#preProcessForWindows()</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10093">HADOOP-10093</a>.
+     Major bug reported by shanyu zhao and fixed by shanyu zhao (conf)<br>
+     <b>hadoop-env.cmd sets HADOOP_CLIENT_OPTS with a max heap size that is too small.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10090">HADOOP-10090</a>.
+     Major bug reported by Ivan Mitic and fixed by Ivan Mitic (metrics)<br>
+     <b>Jobtracker metrics not updated properly after execution of a mapreduce job</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10088">HADOOP-10088</a>.
+     Major bug reported by Raja Aluri and fixed by Raja Aluri (build)<br>
+     <b>copy-nativedistlibs.sh needs to quote snappy lib dir</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10087">HADOOP-10087</a>.
+     Major bug reported by Yu Gao and fixed by Colin Patrick McCabe (security)<br>
+     <b>UserGroupInformation.getGroupNames() fails to return primary group first when JniBasedUnixGroupsMappingWithFallback is used</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10086">HADOOP-10086</a>.
+     Minor improvement reported by Masatake Iwasaki and fixed by Masatake Iwasaki (documentation)<br>
+     <b>User document for authentication in secure cluster</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10081">HADOOP-10081</a>.
+     Critical bug reported by Jason Lowe and fixed by Tsuyoshi OZAWA (ipc)<br>
+     <b>Client.setupIOStreams can leak socket resources on exception or error</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10079">HADOOP-10079</a>.
+     Major improvement reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe <br>
+     <b>log a warning message if group resolution takes too long.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10078">HADOOP-10078</a>.
+     Minor bug reported by Robert Kanter and fixed by Robert Kanter (security)<br>
+     <b>KerberosAuthenticator always does SPNEGO</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10072">HADOOP-10072</a>.
+     Trivial bug reported by Chris Nauroth and fixed by Chris Nauroth (nfs , test)<br>
+     <b>TestNfsExports#testMultiMatchers fails due to non-deterministic timing around cache expiry check.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10067">HADOOP-10067</a>.
+     Minor improvement reported by Robert Rati and fixed by Robert Rati <br>
+     <b>Missing POM dependency on jsr305</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10064">HADOOP-10064</a>.
+     Major improvement reported by Arpit Agarwal and fixed by Arpit Agarwal (build)<br>
+     <b>Upgrade to maven antrun plugin version 1.7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10058">HADOOP-10058</a>.
+     Minor bug reported by Akira AJISAKA and fixed by Chen He (metrics)<br>
+     <b>TestMetricsSystemImpl#testInitFirstVerifyStopInvokedImmediately fails on trunk</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10055">HADOOP-10055</a>.
+     Trivial bug reported by Eli Collins and fixed by Akira AJISAKA (documentation)<br>
+     <b>FileSystemShell.apt.vm doc has typo "numRepicas" </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10052">HADOOP-10052</a>.
+     Major sub-task reported by Andrew Wang and fixed by Andrew Wang (fs)<br>
+     <b>Temporarily disable client-side symlink resolution</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10047">HADOOP-10047</a>.
+     Major new feature reported by Gopal V and fixed by Gopal V (io)<br>
+     <b>Add a directbuffer Decompressor API to hadoop</b><br>
+     <blockquote>Direct Bytebuffer decompressors for Zlib (Deflate &amp; Gzip) and Snappy </blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10046">HADOOP-10046</a>.
+     Trivial improvement reported by David S. Wang and fixed by David S. Wang <br>
+     <b>Print a log message when SSL is enabled</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10040">HADOOP-10040</a>.
+     Major bug reported by Yingda Chen and fixed by Chris Nauroth <br>
+     <b>hadoop.cmd is in UNIX format and would not run by default on Windows</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10039">HADOOP-10039</a>.
+     Major bug reported by Suresh Srinivas and fixed by Haohui Mai (security)<br>
+     <b>Add Hive to the list of projects using AbstractDelegationTokenSecretManager</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10031">HADOOP-10031</a>.
+     Major bug reported by Chuan Liu and fixed by Chuan Liu (fs)<br>
+     <b>FsShell -get/copyToLocal/moveFromLocal should support Windows local path</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10030">HADOOP-10030</a>.
+     Major bug reported by Chuan Liu and fixed by Chuan Liu <br>
+     <b>FsShell -put/copyFromLocal should support Windows local path</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10029">HADOOP-10029</a>.
+     Major bug reported by Suresh Srinivas and fixed by Suresh Srinivas (fs)<br>
+     <b>Specifying har file to MR job fails in secure cluster</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10028">HADOOP-10028</a>.
+     Minor bug reported by Jing Zhao and fixed by Haohui Mai <br>
+     <b>Malformed ssl-server.xml.example </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10006">HADOOP-10006</a>.
+     Blocker bug reported by Junping Du and fixed by Junping Du (fs , util)<br>
+     <b>Compilation failure in trunk for o.a.h.fs.swift.util.JSONUtil</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-10005">HADOOP-10005</a>.
+     Trivial improvement reported by Jackie Chang and fixed by Jackie Chang <br>
+     <b>No need to check whether INFO severity level is enabled</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9998">HADOOP-9998</a>.
+     Major improvement reported by Junping Du and fixed by Junping Du (net)<br>
+     <b>Provide methods to clear only part of the DNSToSwitchMapping</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9982">HADOOP-9982</a>.
+     Major bug reported by Akira AJISAKA and fixed by Akira AJISAKA (documentation)<br>
+     <b>Fix dead links in hadoop site docs</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9981">HADOOP-9981</a>.
+     Critical bug reported by Kihwal Lee and fixed by Colin Patrick McCabe <br>
+     <b>globStatus should minimize its listStatus and getFileStatus calls</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9964">HADOOP-9964</a>.
+     Major bug reported by Junping Du and fixed by Junping Du (util)<br>
+     <b>O.A.H.U.ReflectionUtils.printThreadInfo() is not thread-safe, which causes TestHttpServer to hang for 10 minutes or longer.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9956">HADOOP-9956</a>.
+     Major sub-task reported by Daryn Sharp and fixed by Daryn Sharp (ipc)<br>
+     <b>RPC listener inefficiently assigns connections to readers</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9955">HADOOP-9955</a>.
+     Major sub-task reported by Daryn Sharp and fixed by Daryn Sharp (ipc)<br>
+     <b>RPC idle connection closing is extremely inefficient</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9929">HADOOP-9929</a>.
+     Major bug reported by Jason Lowe and fixed by Colin Patrick McCabe (fs)<br>
+     <b>Insufficient permissions for a path reported as file not found</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9915">HADOOP-9915</a>.
+     Trivial improvement reported by Binglin Chang and fixed by Binglin Chang <br>
+     <b>o.a.h.fs.Stat support on Mac OS X</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9909">HADOOP-9909</a>.
+     Major improvement reported by Shinichi Yamashita and fixed by  (fs)<br>
+     <b>org.apache.hadoop.fs.Stat should permit other LANG</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9908">HADOOP-9908</a>.
+     Major bug reported by Todd Lipcon and fixed by Todd Lipcon (util)<br>
+     <b>Fix NPE when versioninfo properties file is missing</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9898">HADOOP-9898</a>.
+     Minor bug reported by Todd Lipcon and fixed by Todd Lipcon (ipc , net)<br>
+     <b>Set SO_KEEPALIVE on all our sockets</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9897">HADOOP-9897</a>.
+     Trivial improvement reported by Binglin Chang and fixed by Binglin Chang (fs)<br>
+     <b>Add method to get path start position without drive specifier in o.a.h.fs.Path  </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9889">HADOOP-9889</a>.
+     Major bug reported by Wei Yan and fixed by Wei Yan <br>
+     <b>Refresh the Krb5 configuration when creating a new kdc in Hadoop-MiniKDC</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9887">HADOOP-9887</a>.
+     Major bug reported by Chris Nauroth and fixed by Chuan Liu (fs)<br>
+     <b>globStatus does not correctly handle paths starting with a drive spec on Windows</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9875">HADOOP-9875</a>.
+     Minor bug reported by Aaron T. Myers and fixed by Aaron T. Myers (test)<br>
+     <b>TestDoAsEffectiveUser can fail on JDK 7</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9871">HADOOP-9871</a>.
+     Minor bug reported by Luke Lu and fixed by Junping Du <br>
+     <b>Fix intermittent findbugs warnings in DefaultMetricsSystem</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9866">HADOOP-9866</a>.
+     Major test reported by Alejandro Abdelnur and fixed by Wei Yan (test)<br>
+     <b>Convert hadoop-auth testcases requiring Kerberos to use MiniKDC</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9865">HADOOP-9865</a>.
+     Major bug reported by Chuan Liu and fixed by Chuan Liu <br>
+     <b>FileContext.globStatus() has a regression with respect to relative path</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9860">HADOOP-9860</a>.
+     Major improvement reported by Wei Yan and fixed by Wei Yan <br>
+     <b>Remove class HackedKeytab and HackedKeytabEncoder from hadoop-minikdc once jira DIRSERVER-1882 solved</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9848">HADOOP-9848</a>.
+     Major new feature reported by Wei Yan and fixed by Wei Yan (security , test)<br>
+     <b>Create a MiniKDC for use with security testing</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9847">HADOOP-9847</a>.
+     Minor bug reported by Andrew Wang and fixed by Colin Patrick McCabe <br>
+     <b>TestGlobPath symlink tests fail to cleanup properly</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9833">HADOOP-9833</a>.
+     Minor improvement reported by Steve Loughran and fixed by Kousuke Saruta (build)<br>
+     <b>move slf4j to version 1.7.5</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9830">HADOOP-9830</a>.
+     Trivial bug reported by Dmitry Lysnichenko and fixed by Kousuke Saruta (documentation)<br>
+     <b>Typo at http://hadoop.apache.org/docs/current/</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9820">HADOOP-9820</a>.
+     Blocker bug reported by Daryn Sharp and fixed by Daryn Sharp (ipc , security)<br>
+     <b>RPCv9 wire protocol is insufficient to support multiplexing</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9817">HADOOP-9817</a>.
+     Major bug reported by Colin Patrick McCabe and fixed by Colin Patrick McCabe <br>
+     <b>FileSystem#globStatus and FileContext#globStatus need to work with symlinks</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9806">HADOOP-9806</a>.
+     Major bug reported by Brandon Li and fixed by Brandon Li (nfs)<br>
+     <b>PortmapInterface should check if the procedure is out-of-range</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9791">HADOOP-9791</a>.
+     Major bug reported by Ivan Mitic and fixed by Ivan Mitic <br>
+     <b>Add a test case covering long paths for new FileUtil access check methods</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9787">HADOOP-9787</a>.
+     Major bug reported by Karthik Kambatla and fixed by Karthik Kambatla (util)<br>
+     <b>ShutdownHelper util to shutdown threads and threadpools</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9784">HADOOP-9784</a>.
+     Major improvement reported by Junping Du and fixed by Junping Du <br>
+     <b>Add a builder for HttpServer</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9748">HADOOP-9748</a>.
+     Critical sub-task reported by Daryn Sharp and fixed by Daryn Sharp (security)<br>
+     <b>Reduce blocking on UGI.ensureInitialized</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9703">HADOOP-9703</a>.
+     Minor bug reported by Mark Miller and fixed by Tsuyoshi OZAWA <br>
+     <b>org.apache.hadoop.ipc.Client leaks threads on stop.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9698">HADOOP-9698</a>.
+     Blocker sub-task reported by Daryn Sharp and fixed by Daryn Sharp (ipc)<br>
+     <b>RPCv9 client must honor server's SASL negotiate response</b><br>
+     <blockquote>The RPC client now waits for the Server's SASL negotiate response before instantiating its SASL client.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9693">HADOOP-9693</a>.
+     Trivial improvement reported by Steve Loughran and fixed by  <br>
+     <b>Shell should add a probe for OSX</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9686">HADOOP-9686</a>.
+     Major improvement reported by Jason Lowe and fixed by Jason Lowe (conf)<br>
+     <b>Easy access to final parameters in Configuration</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9683">HADOOP-9683</a>.
+     Blocker sub-task reported by Luke Lu and fixed by Daryn Sharp (ipc)<br>
+     <b>Wrap IpcConnectionContext in RPC headers</b><br>
+     <blockquote>Connection context is now sent as an RPC-header-wrapped protobuf.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9660">HADOOP-9660</a>.
+     Major bug reported by Enis Soztutar and fixed by Enis Soztutar (scripts , util)<br>
+     <b>[WINDOWS] Powershell / cmd parses -Dkey=value from command line as [-Dkey, value] which breaks GenericsOptionParser</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9652">HADOOP-9652</a>.
+     Major improvement reported by Colin Patrick McCabe and fixed by Andrew Wang <br>
+     <b>Allow RawLocalFs#getFileLinkStatus to fill in the link owner and mode if requested</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9635">HADOOP-9635</a>.
+     Major bug reported by V. Karthik Kumar and fixed by  (native)<br>
+     <b>Fix Potential Stack Overflow in DomainSocket.c</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9623">HADOOP-9623</a>.
+     Major improvement reported by Timothy St. Clair and fixed by Amandeep Khurana (fs/s3)<br>
+     <b>Update jets3t dependency to  0.9.0 </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9618">HADOOP-9618</a>.
+     Major new feature reported by Todd Lipcon and fixed by Todd Lipcon (util)<br>
+     <b>Add thread which detects JVM pauses</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9611">HADOOP-9611</a>.
+     Major improvement reported by Timothy St. Clair and fixed by Timothy St. Clair (build)<br>
+     <b>mvn-rpmbuild against google-guice &gt; 3.0 yields missing cglib dependency</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9598">HADOOP-9598</a>.
+     Major test reported by Aleksey Gorshkov and fixed by Andrey Klochkov <br>
+     <b>Improve code coverage of RMAdminCLI</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9594">HADOOP-9594</a>.
+     Major improvement reported by Timothy St. Clair and fixed by Timothy St. Clair (build)<br>
+     <b>Update apache commons math dependency</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9582">HADOOP-9582</a>.
+     Major bug reported by Ashwin Shankar and fixed by Ashwin Shankar (conf)<br>
+     <b>Non-existent file to "hadoop fs -conf" doesn't throw error</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9527">HADOOP-9527</a>.
+     Major bug reported by Arpit Agarwal and fixed by Arpit Agarwal (fs , test)<br>
+     <b>Add symlink support to LocalFileSystem on Windows</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9515">HADOOP-9515</a>.
+     Major new feature reported by Brandon Li and fixed by Brandon Li <br>
+     <b>Add general interface for NFS and Mount</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9509">HADOOP-9509</a>.
+     Major new feature reported by Brandon Li and fixed by Brandon Li <br>
+     <b>Implement ONCRPC and XDR</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9494">HADOOP-9494</a>.
+     Major improvement reported by Dennis Y and fixed by Andrey Klochkov <br>
+     <b>Excluded auto-generated and examples code from clover reports</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9487">HADOOP-9487</a>.
+     Major improvement reported by Steve Loughran and fixed by  (conf)<br>
+     <b>Deprecation warnings in Configuration should go to their own log or otherwise be suppressible</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9470">HADOOP-9470</a>.
+     Major improvement reported by Ivan A. Veselovsky and fixed by Ivan A. Veselovsky (test)<br>
+     <b>eliminate duplicate FQN tests in different Hadoop modules</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9432">HADOOP-9432</a>.
+     Minor new feature reported by Steve Loughran and fixed by  (build , documentation)<br>
+     <b>Add support for markdown .md files in site documentation</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9421">HADOOP-9421</a>.
+     Blocker sub-task reported by Sanjay Radia and fixed by Daryn Sharp <br>
+     <b>Convert SASL to use ProtoBuf and provide negotiation capabilities</b><br>
+     <blockquote>Raw SASL protocol now uses protobufs wrapped with RPC headers.
+The negotiation sequence incorporates the state of the exchange.
+The server now has the ability to advertise its supported auth types.</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9420">HADOOP-9420</a>.
+     Major bug reported by Todd Lipcon and fixed by Liang Xie (ipc , metrics)<br>
+     <b>Add percentile or max metric for rpcQueueTime, processing time</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9417">HADOOP-9417</a>.
+     Major sub-task reported by Andrew Wang and fixed by Andrew Wang (fs)<br>
+     <b>Support for symlink resolution in LocalFileSystem / RawLocalFileSystem</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9350">HADOOP-9350</a>.
+     Minor bug reported by Steve Loughran and fixed by Robert Kanter (build)<br>
+     <b>Hadoop not building against Java7 on OSX </b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9319">HADOOP-9319</a>.
+     Major improvement reported by Arpit Agarwal and fixed by Binglin Chang <br>
+     <b>Update bundled lz4 source to latest version</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9291">HADOOP-9291</a>.
+     Major test reported by Ivan A. Veselovsky and fixed by Ivan A. Veselovsky <br>
+     <b>enhance unit-test coverage of package o.a.h.metrics2</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9254">HADOOP-9254</a>.
+     Major test reported by Vadim Bondarev and fixed by Vadim Bondarev <br>
+     <b>Cover packages org.apache.hadoop.util.bloom, org.apache.hadoop.util.hash</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9241">HADOOP-9241</a>.
+     Trivial improvement reported by Harsh J and fixed by Harsh J <br>
+     <b>DU refresh interval is not configurable</b><br>
+     <blockquote>The 'du' (disk usage command from Unix) script refresh monitor is now configurable in the same way as its 'df' counterpart, via the property 'fs.du.interval'; the default is 10 minutes (in ms).</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9225">HADOOP-9225</a>.
+     Major test reported by Vadim Bondarev and fixed by Andrey Klochkov <br>
+     <b>Cover package org.apache.hadoop.compress.Snappy</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9199">HADOOP-9199</a>.
+     Major test reported by Vadim Bondarev and fixed by Andrey Klochkov <br>
+     <b>Cover package org.apache.hadoop.io with unit tests</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9114">HADOOP-9114</a>.
+     Minor bug reported by liuyang and fixed by sathish <br>
+     <b>After defining dfs.checksum.type as NULL, writing a file and calling hflush will throw java.lang.ArrayIndexOutOfBoundsException</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9078">HADOOP-9078</a>.
+     Major test reported by Ivan A. Veselovsky and fixed by Ivan A. Veselovsky <br>
+     <b>enhance unit-test coverage of class org.apache.hadoop.fs.FileContext</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9063">HADOOP-9063</a>.
+     Minor test reported by Ivan A. Veselovsky and fixed by Ivan A. Veselovsky <br>
+     <b>enhance unit-test coverage of class org.apache.hadoop.fs.FileUtil</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-9016">HADOOP-9016</a>.
+     Minor bug reported by Ivan A. Veselovsky and fixed by Ivan A. Veselovsky <br>
+     <b>org.apache.hadoop.fs.HarFileSystem.HarFSDataInputStream.HarFsInputStream.skip(long) must never return negative value.</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-8814">HADOOP-8814</a>.
+     Minor improvement reported by Brandon Li and fixed by Brandon Li (conf , fs , fs/s3 , ha , io , metrics , performance , record , security , util)<br>
+     <b>Inefficient comparison with the empty string. Use isEmpty() instead</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-8753">HADOOP-8753</a>.
+     Minor bug reported by Nishan Shetty, Huawei and fixed by Benoy Antony <br>
+     <b>LocalDirAllocator throws "ArithmeticException: / by zero" when there is no available space on configured local dir</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-8704">HADOOP-8704</a>.
+     Major improvement reported by Thomas Graves and fixed by Jonathan Eagles <br>
+     <b>add request logging to jetty/httpserver</b><br>
+     <blockquote></blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-8545">HADOOP-8545</a>.
+     Major new feature reported by Tim Miller and fixed by Dmitry Mezhensky (fs)<br>
+     <b>Filesystem Implementation for OpenStack Swift</b><br>
+     <blockquote>Added a file system implementation for OpenStack Swift.
+There are two implementations: block and native (similar to the Amazon S3 integration).
+The data locality issue is solved by a patch in Swift; the commit procedure to OpenStack is in progress.
+
+To use the implementation, add the following to core-site.xml:
+...
+	&lt;property&gt;
+	        &lt;name&gt;fs.swift.impl&lt;/name&gt;
+	    	&lt;value&gt;com.mirantis.fs.SwiftFileSystem&lt;/value&gt;
+	&lt;/property&gt;
+	&lt;property&gt;
+	    	&lt;name&gt;fs.swift.block.impl&lt;/name&gt;
+	         &lt;value&gt;com.mirantis.fs.block.SwiftBlockFileSystem&lt;/value&gt;
+        &lt;/property&gt;
+...
+
+In a MapReduce job, specify the following configs for OpenStack Keystone authentication:
+conf.set("swift.auth.url", "http://172.18.66.117:5000/v2.0/tokens");
+conf.set("swift.tenant", "superuser");
+conf.set("swift.username", "admin1");
+conf.set("swift.password", "password");
+conf.setInt("swift.http.port", 8080);
+conf.setInt("swift.https.port", 443);
+
+Additional information is available on GitHub: https://github.com/DmitryMezhensky/Hadoop-and-Swift-integration</blockquote></li>
+<li> <a href="https://issues.apache.org/jira/browse/HADOOP-7344">HADOOP-7344</a>.
+     Major bug reported by Daryn Sharp and fixed by Colin Patrick McCabe (fs)<br>
+     <b>globStatus doesn't grok groupings with a slash</b><br>
+     <blockquote></blockquote></li>
+</ul>
+</body></html>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with

+ 22 - 6
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java

@@ -114,7 +114,8 @@ class Globber {
       if (fs != null) {
         scheme = fs.getUri().getScheme();
       } else {
-        scheme = fc.getDefaultFileSystem().getUri().getScheme();
+        scheme = fc.getFSofPath(fc.fixRelativePart(path)).
+                    getUri().getScheme();
       }
     }
     return scheme;
@@ -126,7 +127,8 @@ class Globber {
       if (fs != null) {
         authority = fs.getUri().getAuthority();
       } else {
-        authority = fc.getDefaultFileSystem().getUri().getAuthority();
+        authority = fc.getFSofPath(fc.fixRelativePart(path)).
+                      getUri().getAuthority();
       }
     }
     return authority ;
@@ -162,18 +164,26 @@ class Globber {
       // Starting out at the root of the filesystem, we try to match
       // filesystem entries against pattern components.
       ArrayList<FileStatus> candidates = new ArrayList<FileStatus>(1);
+      // To get the "real" FileStatus of root, we'd have to do an expensive
+      // RPC to the NameNode.  So we create a placeholder FileStatus which has
+      // the correct path, but defaults for the rest of the information.
+      // Later, if it turns out we actually want the FileStatus of root, we'll
+      // replace the placeholder with a real FileStatus obtained from the
+      // NameNode.
+      FileStatus rootPlaceholder;
       if (Path.WINDOWS && !components.isEmpty()
           && Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
         // On Windows the path could begin with a drive letter, e.g. /E:/foo.
         // We will skip matching the drive letter and start from listing the
         // root of the filesystem on that drive.
         String driveLetter = components.remove(0);
-        candidates.add(new FileStatus(0, true, 0, 0, 0, new Path(scheme,
-            authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR)));
+        rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
+            authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
       } else {
-        candidates.add(new FileStatus(0, true, 0, 0, 0,
-            new Path(scheme, authority, Path.SEPARATOR)));
+        rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
+            new Path(scheme, authority, Path.SEPARATOR));
       }
+      candidates.add(rootPlaceholder);
       
       for (int componentIdx = 0; componentIdx < components.size();
           componentIdx++) {
@@ -245,6 +255,12 @@ class Globber {
         candidates = newCandidates;
       }
       for (FileStatus status : candidates) {
+        // Use object equality to see if this status is the root placeholder.
+        // See the explanation for rootPlaceholder above for more information.
+        if (status == rootPlaceholder) {
+          status = getFileStatus(rootPlaceholder.getPath());
+          if (status == null) continue;
+        }
         // HADOOP-3497 semantics: the user-defined filter is applied at the
         // end, once the full path is built up.
         if (filter.accept(status.getPath())) {

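A minimal sketch of the FileContext-based globbing this change improves (the configuration, working directory, and "data/*.txt" pattern are placeholder assumptions, not part of the patch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;

public class GlobSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileContext fc = FileContext.getFileContext(conf);
    // The scheme and authority of the pattern are now derived from the
    // filesystem that owns the (fixed-up) path, so a relative pattern
    // resolves against the working directory rather than the default FS.
    FileStatus[] matches = fc.util().globStatus(new Path("data/*.txt"));
    if (matches != null) {
      for (FileStatus status : matches) {
        System.out.println(status.getPath());
      }
    }
  }
}
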
+ 6 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/s3/S3FileSystem.java

@@ -443,6 +443,12 @@ public class S3FileSystem extends FileSystem {
     return getConf().getLong("fs.s3.block.size", 64 * 1024 * 1024);
   }
 
+  @Override
+  public String getCanonicalServiceName() {
+    // Does not support Token
+    return null;
+  }
+
   // diagnostic methods
 
   void dump() throws IOException {

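S3FileSystem (and, in the next hunk, NativeS3FileSystem) now returns null from getCanonicalServiceName(), the conventional signal that no delegation tokens are issued. A minimal caller-side sketch; the bucket URI and renewer name are placeholders, not part of the patch:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.security.token.Token;

public class S3TokenSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create("s3n://example-bucket/"), conf);
    if (fs.getCanonicalServiceName() == null) {
      // No token service: token collection simply skips this filesystem.
      System.out.println("no delegation tokens for " + fs.getUri());
    } else {
      Token<?> token = fs.getDelegationToken("renewer");
      System.out.println("fetched " + token);
    }
  }
}
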
+ 6 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/s3native/NativeS3FileSystem.java

@@ -733,4 +733,10 @@ public class NativeS3FileSystem extends FileSystem {
   public Path getWorkingDirectory() {
     return workingDir;
   }
+
+  @Override
+  public String getCanonicalServiceName() {
+    // Does not support Token
+    return null;
+  }
 }

+ 6 - 2
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/LossyRetryInvocationHandler.java

@@ -51,11 +51,15 @@ public class LossyRetryInvocationHandler<T> extends RetryInvocationHandler<T> {
     int retryCount = RetryCount.get();
     if (retryCount < this.numToDrop) {
       RetryCount.set(++retryCount);
-      LOG.info("Drop the response. Current retryCount == " + retryCount);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Drop the response. Current retryCount == " + retryCount);
+      }
       throw new RetriableException("Fake Exception");
     } else {
-      LOG.info("retryCount == " + retryCount
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("retryCount == " + retryCount
           + ". It's time to normally process the response");
+      }
       return result;
     }
   }

+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java

@@ -220,7 +220,7 @@ public class Client {
    * @param conf Configuration
    * @return the ping interval
    */
-  final static int getPingInterval(Configuration conf) {
+  final public static int getPingInterval(Configuration conf) {
     return conf.getInt(CommonConfigurationKeys.IPC_PING_INTERVAL_KEY,
         CommonConfigurationKeys.IPC_PING_INTERVAL_DEFAULT);
   }

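Making getPingInterval public lets code outside org.apache.hadoop.ipc read the effective ping interval. A minimal sketch, assuming only default configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.Client;

public class PingIntervalSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Reads ipc.ping.interval, falling back to the built-in default.
    int pingMillis = Client.getPingInterval(conf);
    System.out.println("ipc.ping.interval = " + pingMillis + " ms");
  }
}
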
+ 4 - 2
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java

@@ -66,6 +66,7 @@ import javax.security.sasl.SaslServer;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration.IntegerRanges;
@@ -454,9 +455,10 @@ public abstract class Server {
    * Refresh the service authorization ACL for the service handled by this server
    * using the specified Configuration.
    */
-  public void refreshServiceAclWithConfigration(Configuration conf,
+  @Private
+  public void refreshServiceAclWithLoadedConfiguration(Configuration conf,
       PolicyProvider provider) {
-    serviceAuthorizationManager.refreshWithConfiguration(conf, provider);
+    serviceAuthorizationManager.refreshWithLoadedConfiguration(conf, provider);
   }
   /**
    * Returns a handle to the serviceAuthorizationManager (required in tests)

+ 7 - 4
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/unix/DomainSocketWatcher.java

@@ -37,6 +37,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.util.NativeCodeLoader;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.util.concurrent.Uninterruptibles;
 
@@ -48,7 +49,7 @@ import com.google.common.util.concurrent.Uninterruptibles;
  * See {@link DomainSocket} for more information about UNIX domain sockets.
  */
 @InterfaceAudience.LimitedPrivate("HDFS")
-public final class DomainSocketWatcher extends Thread implements Closeable {
+public final class DomainSocketWatcher implements Closeable {
   static {
     if (SystemUtils.IS_OS_WINDOWS) {
       loadingFailureReason = "UNIX Domain sockets are not available on Windows.";
@@ -281,7 +282,7 @@ public final class DomainSocketWatcher extends Thread implements Closeable {
         try {
           processedCond.await();
         } catch (InterruptedException e) {
-          this.interrupt();
+          Thread.currentThread().interrupt();
         }
         if (!toAdd.contains(entry)) {
           break;
@@ -308,7 +309,7 @@ public final class DomainSocketWatcher extends Thread implements Closeable {
         try {
           processedCond.await();
         } catch (InterruptedException e) {
-          this.interrupt();
+          Thread.currentThread().interrupt();
         }
         if (!toRemove.containsKey(sock.fd)) {
           break;
@@ -381,7 +382,8 @@ public final class DomainSocketWatcher extends Thread implements Closeable {
     }
   }
 
-  private final Thread watcherThread = new Thread(new Runnable() {
+  @VisibleForTesting
+  final Thread watcherThread = new Thread(new Runnable() {
     @Override
     public void run() {
       LOG.info(this + ": starting with interruptCheckPeriodMs = " +
@@ -443,6 +445,7 @@ public final class DomainSocketWatcher extends Thread implements Closeable {
       } catch (IOException e) {
         LOG.error(toString() + " terminating on IOException", e);
       } finally {
+        kick(); // allow the handler for notificationSockets[0] to read a byte
         for (Entry entry : entries.values()) {
           sendCallback("close", entries, fdSet, entry.getDomainSocket().fd);
         }

+ 15 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Groups.java

@@ -27,6 +27,7 @@ import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.hadoop.HadoopIllegalArgumentException;
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeys;
@@ -240,4 +241,18 @@ public class Groups {
     }
     return GROUPS;
   }
+
+  /**
+   * Create new groups used to map user-to-groups with loaded configuration.
+   * @param conf
+   * @return the groups being used to map user-to-groups.
+   */
+  @Private
+  public static synchronized Groups
+      getUserToGroupsMappingServiceWithLoadedConfiguration(
+          Configuration conf) {
+
+    GROUPS = new Groups(conf);
+    return GROUPS;
+  }
 }

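A minimal sketch of how a refresh path that has already loaded its Configuration might use the new @Private factory; the user name "alice" is a placeholder, not part of the patch:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Groups;

public class GroupsRefreshSketch {
  public static void main(String[] args) throws Exception {
    Configuration loadedConf = new Configuration();
    Groups groups =
        Groups.getUserToGroupsMappingServiceWithLoadedConfiguration(loadedConf);
    List<String> memberships = groups.getGroups("alice");
    System.out.println("alice -> " + memberships);
  }
}
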
+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/LdapGroupsMapping.java

@@ -356,7 +356,7 @@ public class LdapGroupsMapping
         c = reader.read();
       }
       reader.close();
-      return password.toString();
+      return password.toString().trim();
     } catch (IOException ioe) {
       throw new RuntimeException("Could not read password file: " + pwFile, ioe);
     }

+ 4 - 2
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/authorize/ServiceAuthorizationManager.java

@@ -26,6 +26,7 @@ import java.util.Set;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeys;
@@ -122,10 +123,11 @@ public class ServiceAuthorizationManager {
     // Make a copy of the original config, and load the policy file
     Configuration policyConf = new Configuration(conf);
     policyConf.addResource(policyFile);
-    refreshWithConfiguration(policyConf, provider);
+    refreshWithLoadedConfiguration(policyConf, provider);
   }
 
-  public synchronized void refreshWithConfiguration(Configuration conf,
+  @Private
+  public synchronized void refreshWithLoadedConfiguration(Configuration conf,
       PolicyProvider provider) {
     final Map<Class<?>, AccessControlList> newAcls =
         new IdentityHashMap<Class<?>, AccessControlList>();

+ 53 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Waitable.java

@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.util;
+
+import java.util.concurrent.locks.Condition;
+
+/**
+ * Represents an object that you can wait for.
+ */
+public class Waitable<T> {
+  private T val;
+  private final Condition cond;
+
+  public Waitable(Condition cond) {
+    this.val = null;
+    this.cond = cond;
+  }
+
+  public T await() throws InterruptedException {
+    while (this.val == null) {
+      this.cond.await();
+    }
+    return this.val;
+  }
+
+  public void provide(T val) {
+    this.val = val;
+    this.cond.signalAll();
+  }
+
+  public boolean hasVal() {
+    return this.val != null;
+  }
+
+  public T getVal() {
+    return this.val;
+  }
+}

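A minimal usage sketch for the new Waitable class (not part of the patch); note that both await() and provide() must be called while holding the lock that owns the Condition:

import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop.util.Waitable;

public class WaitableSketch {
  public static void main(String[] args) throws InterruptedException {
    final ReentrantLock lock = new ReentrantLock();
    final Waitable<String> result = new Waitable<String>(lock.newCondition());

    Thread producer = new Thread(new Runnable() {
      @Override
      public void run() {
        lock.lock();
        try {
          result.provide("done");   // sets the value and signals waiters
        } finally {
          lock.unlock();
        }
      }
    });
    producer.start();

    lock.lock();
    try {
      System.out.println(result.await());  // blocks until provide() runs
    } finally {
      lock.unlock();
    }
  }
}
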
+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/overview.html

@@ -57,7 +57,7 @@ that process vast amounts of data. Here's what makes Hadoop especially useful:</
 
 <ul>
   <li>
-    Hadoop was been demonstrated on GNU/Linux clusters with 2000 nodes.
+    Hadoop has been demonstrated on GNU/Linux clusters with more than 4000 nodes.
   </li>
   <li>
     Windows is also a supported platform.

+ 6 - 1
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java

@@ -54,5 +54,10 @@ public abstract class S3FileSystemContractBaseTest
     assertEquals("Double default block size", newBlockSize,
 	fs.getFileStatus(file).getBlockSize());
   }
-  
+
+  public void testCanonicalName() throws Exception {
+    assertNull("s3 doesn't support security token and shouldn't have canonical name",
+               fs.getCanonicalServiceName());
+  }
+
 }

+ 7 - 2
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java

@@ -48,7 +48,12 @@ public abstract class NativeS3FileSystemContractBaseTest
     store.purge("test");
     super.tearDown();
   }
-  
+
+  public void testCanonicalName() throws Exception {
+    assertNull("s3n doesn't support security token and shouldn't have canonical name",
+               fs.getCanonicalServiceName());
+  }
+
   public void testListStatusForRoot() throws Exception {
     FileStatus[] paths = fs.listStatus(path("/"));
     assertEquals("Root directory is not empty; ", 0, paths.length);
@@ -60,7 +65,7 @@ public abstract class NativeS3FileSystemContractBaseTest
     assertEquals(1, paths.length);
     assertEquals(path("/test"), paths[0].getPath());
   }
-  
+
   public void testNoTrailingBackslashOnBucket() throws Exception {
     assertTrue(fs.getFileStatus(new Path(fs.getUri().toString())).isDirectory());
   }

+ 4 - 3
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/unix/TestDomainSocketWatcher.java

@@ -73,9 +73,10 @@ public class TestDomainSocketWatcher {
    */
   @Test(timeout=60000)
   public void testInterruption() throws Exception {
-    DomainSocketWatcher watcher = new DomainSocketWatcher(10);
-    watcher.interrupt();
-    Uninterruptibles.joinUninterruptibly(watcher);
+    final DomainSocketWatcher watcher = new DomainSocketWatcher(10);
+    watcher.watcherThread.interrupt();
+    Uninterruptibles.joinUninterruptibly(watcher.watcherThread);
+    watcher.close();
   }
   
   @Test(timeout=300000)

+ 4 - 2
hadoop-hdfs-project/hadoop-hdfs-nfs/src/main/java/org/apache/hadoop/hdfs/nfs/nfs3/RpcProgramNfs3.java

@@ -545,7 +545,8 @@ public class RpcProgramNfs3 extends RpcProgram implements Nfs3Interface {
         return new READLINK3Response(Nfs3Status.NFS3ERR_SERVERFAULT);
       }
       if (MAX_READ_TRANSFER_SIZE < target.getBytes().length) {
-        return new READLINK3Response(Nfs3Status.NFS3ERR_IO, postOpAttr, null);
+        return new READLINK3Response(Nfs3Status.NFS3ERR_IO, postOpAttr,
+            new byte[0]);
       }
 
       return new READLINK3Response(Nfs3Status.NFS3_OK, postOpAttr,
@@ -1828,7 +1829,8 @@ public class RpcProgramNfs3 extends RpcProgram implements Nfs3Interface {
       } catch (IOException e1) {
         LOG.info("Can't get postOpAttr for fileId: " + handle.getFileId());
       }
-      WccData fileWcc = new WccData(Nfs3Utils.getWccAttr(preOpAttr), postOpAttr);
+      WccData fileWcc = new WccData(preOpAttr == null ? null
+          : Nfs3Utils.getWccAttr(preOpAttr), postOpAttr);
       return new COMMIT3Response(Nfs3Status.NFS3ERR_IO, fileWcc,
           Nfs3Constant.WRITE_COMMIT_VERF);
     }

+ 144 - 28
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -120,31 +120,6 @@ Trunk (Unreleased)
     HDFS-5041. Add the time of last heartbeat to dead server Web UI (Shinichi
     Yamashita via brandonli)
 
-    HDFS-5531. Combine the getNsQuota() and getDsQuota() methods in INode.
-    (szetszwo)
-
-    HDFS-5285. Flatten INodeFile hierarchy: Replace INodeFileUnderConstruction
-    and INodeFileUnderConstructionWithSnapshot with FileUnderContructionFeature.
-    (jing9 via szetszwo)
-
-    HDFS-5286. Flatten INodeDirectory hierarchy: Replace INodeDirectoryWithQuota
-    with DirectoryWithQuotaFeature.  (szetszwo)
-
-    HDFS-5537. Remove FileWithSnapshot interface.  (jing9 via szetszwo)
-
-    HDFS-5554. Flatten INodeFile hierarchy: Replace INodeFileWithSnapshot with
-    FileWithSnapshotFeature.  (jing9 via szetszwo)
-
-    HDFS-5647. Merge INodeDirectory.Feature and INodeFile.Feature. (Haohui Mai
-    via jing9)
-
-    HDFS-5632. Flatten INodeDirectory hierarchy: Replace
-    INodeDirectoryWithSnapshot with DirectoryWithSnapshotFeature.
-    (jing9 via szetszwo)
-
-    HDFS-5715. Use Snapshot ID to indicate the corresponding Snapshot for a
-    FileDiff/DirectoryDiff. (jing9)
-
     HDFS-5721. sharedEditsImage in Namenode#initializeSharedEdits() should be 
     closed before method returns. (Ted Yu via junping_du)
 
@@ -275,8 +250,6 @@ Trunk (Unreleased)
     HDFS-5719. FSImage#doRollback() should close prevState before return
     (Ted Yu via brandonli)
 
-    HDFS-5726. Fix compilation error in AbstractINodeDiff for JDK7. (jing9)
-
     HDFS-5768. Consolidate the serialization code in DelegationTokenSecretManager 
     (Haohui Mai via brandonli)
 
@@ -286,6 +259,60 @@ Trunk (Unreleased)
     HDFS-5794. Fix the inconsistency of layout version number of 
     ADD_DATANODE_AND_STORAGE_UUIDS between trunk and branch-2. (jing9)
 
+  BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
+
+    HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)
+
+    HDFS-5738. Serialize INode information in protobuf. (Haohui Mai via jing9)
+
+    HDFS-5772. Serialize under-construction file information in FSImage. (jing9)
+
+    HDFS-5783. Compute the digest before loading FSImage. (Haohui Mai via jing9)
+
+    HDFS-5785. Serialize symlink in protobuf. (Haohui Mai via jing9)
+
+    HDFS-5793. Optimize the serialization of PermissionStatus. (Haohui Mai via
+    jing9)
+
+    HDFS-5743. Use protobuf to serialize snapshot information. (jing9)
+
+    HDFS-5774. Serialize CachePool directives in protobuf. (Haohui Mai via jing9)
+
+    HDFS-5744. Serialize information for token managers in protobuf. (Haohui Mai
+    via jing9)
+
+    HDFS-5824. Add a Type field in Snapshot DiffEntry's protobuf definition.
+    (jing9)
+
+    HDFS-5808. Implement cancellation when saving FSImage. (Haohui Mai via jing9)
+
+    HDFS-5826. Update the stored edit logs to be consistent with the changes in
+    HDFS-5698 branch. (Haohui Mai via jing9)
+
+    HDFS-5797. Implement offline image viewer. (Haohui Mai via jing9)
+
+    HDFS-5771. Track progress when loading fsimage. (Haohui Mai via cnauroth)
+
+    HDFS-5871. Use PBHelper to serialize CacheDirectiveInfoExpirationProto.
+    (Haohui Mai via jing9)
+
+    HDFS-5884. LoadDelegator should use IOUtils.readFully() to read the magic
+    header. (Haohui Mai via jing9)
+
+    HDFS-5885. Add annotation for repeated fields in the protobuf definition.
+    (Haohui Mai via jing9)
+
+    HDFS-5906. Fixing findbugs and javadoc warnings in the HDFS-5698 branch.
+    (Haohui Mai via jing9)
+
+    HDFS-5911. The id of a CacheDirective instance does not get serialized in 
+    the protobuf-fsimage. (Haohui Mai via jing9)
+
+    HDFS-5915. Refactor FSImageFormatProtobuf to simplify cross section reads.
+    (Haohui Mai via cnauroth)
+
+    HDFS-5847. Consolidate INodeReference into a separate section. (jing9)
+
 Release 2.4.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -311,6 +338,44 @@ Release 2.4.0 - UNRELEASED
     HDFS-4911.  Reduce PeerCache timeout to be commensurate with
     dfs.datanode.socket.reuse.keepalive (cmccabe)
 
+    HDFS-4370. Fix typo Blanacer in DataNode. (Chu Tong via shv)
+
+    HDFS-5929. Add blockpool % usage to HDFS federated nn page.
+    (Siqi Li via suresh)
+
+    HDFS-5810. Unify mmap cache and short-circuit file descriptor cache
+    (cmccabe)
+
+    HDFS-5940. Minor cleanups to ShortCircuitReplica, FsDatasetCache, and
+    DomainSocketWatcher (cmccabe)
+
+    HDFS-5531. Combine the getNsQuota() and getDsQuota() methods in INode.
+    (szetszwo)
+
+    HDFS-5285. Flatten INodeFile hierarchy: Replace INodeFileUnderConstruction
+    and INodeFileUnderConstructionWithSnapshot with FileUnderContructionFeature.
+    (jing9 via szetszwo)
+
+    HDFS-5286. Flatten INodeDirectory hierarchy: Replace INodeDirectoryWithQuota
+    with DirectoryWithQuotaFeature.  (szetszwo)
+
+    HDFS-5537. Remove FileWithSnapshot interface.  (jing9 via szetszwo)
+
+    HDFS-5554. Flatten INodeFile hierarchy: Replace INodeFileWithSnapshot with
+    FileWithSnapshotFeature.  (jing9 via szetszwo)
+
+    HDFS-5647. Merge INodeDirectory.Feature and INodeFile.Feature. (Haohui Mai
+    via jing9)
+
+    HDFS-5632. Flatten INodeDirectory hierarchy: Replace
+    INodeDirectoryWithSnapshot with DirectoryWithSnapshotFeature.
+    (jing9 via szetszwo)
+
+    HDFS-5715. Use Snapshot ID to indicate the corresponding Snapshot for a
+    FileDiff/DirectoryDiff. (jing9)
+
+    HDFS-5726. Fix compilation error in AbstractINodeDiff for JDK7. (jing9)
+
   OPTIMIZATIONS
 
     HDFS-5790. LeaseManager.findPath is very slow when many leases need recovery
@@ -353,7 +418,52 @@ Release 2.4.0 - UNRELEASED
     HDFS-5900. Cannot set cache pool limit of "unlimited" via CacheAdmin.
     (wang)
 
-Release 2.3.0 - UNRELEASED
+    HDFS-5886. Potential null pointer deference in RpcProgramNfs3#readlink()
+    (brandonli)
+
+    HDFS-4858. HDFS DataNode to NameNode RPC should timeout.
+    (Henry Wang via shv)
+
+    HDFS-5879. Some TestHftpFileSystem tests do not close streams.
+    (Gera Shegalov via suresh)
+
+    HDFS-5938. Make BlockReaderFactory#BlockReaderPeer a static class to avoid
+    a findbugs warning. (cmccabe)
+
+    HDFS-5891. webhdfs should not try connecting the DN during redirection
+    (Haohui Mai via brandonli)
+
+    HDFS-5904. TestFileStatus fails intermittently. (Mit Desai via kihwal)
+
+    HDFS-5941. add dfs.namenode.secondary.http-address and
+    dfs.namenode.secondary.https-address in hdfs-default.xml.
+    (Haohui Mai via cnauroth)
+
+    HDFS-5913. Nfs3Utils#getWccAttr() should check attr parameter against null
+    (brandonli)
+
+    HDFS-5934. New Namenode UI back button doesn't work as expected
+    (Travis Thompson via brandonli)
+
+    HDFS-5901. NameNode new UI doesn't support IE8 and IE9 on windows 7
+    (Vinayakumar B via brandonli)
+
+    HDFS-5943. 'dfs.namenode.https-address' property is not loaded from
+    configuration in federation setup. (suresh)
+
+Release 2.3.1 - UNRELEASED
+
+  INCOMPATIBLE CHANGES
+
+  NEW FEATURES
+
+  IMPROVEMENTS
+
+  OPTIMIZATIONS
+
+  BUG FIXES 
+
+Release 2.3.0 - 2014-02-18
 
   INCOMPATIBLE CHANGES
 
@@ -891,6 +1001,12 @@ Release 2.3.0 - UNRELEASED
     HDFS-5873. dfs.http.policy should have higher precedence over dfs.https.enable.
     (Haohui Mai via jing9)
 
+    HDFS-5837. dfs.namenode.replication.considerLoad should consider
+    decommissioned nodes. (Tao Luo via shv)
+
+    HDFS-5921. Cannot browse file system via NN web UI if any directory has
+    the sticky bit set. (atm)
+
   BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS
 
     HDFS-4985. Add storage type to the protocol and expose it in block report

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml

@@ -8,6 +8,9 @@
      <Match>
        <Package name="org.apache.hadoop.hdfs.server.namenode.ha.proto" />
      </Match>
+     <Match>
+       <Class name="~org.apache.hadoop.hdfs.server.namenode.FsImageProto.*" />
+     </Match>
      <Match>
        <Package name="org.apache.hadoop.hdfs.qjournal.protocol" />
      </Match>

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/pom.xml

@@ -458,6 +458,7 @@ http://maven.apache.org/xsd/maven-4.0.0.xsd">
                 <includes>
                   <include>ClientDatanodeProtocol.proto</include>
                   <include>DatanodeProtocol.proto</include>
+                  <include>fsimage.proto</include>
                 </includes>
               </source>
               <output>${project.build.directory}/generated-sources/java</output>

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs

@@ -139,7 +139,7 @@ elif [ "$COMMAND" = "balancer" ] ; then
 elif [ "$COMMAND" = "jmxget" ] ; then
   CLASS=org.apache.hadoop.hdfs.tools.JMXGet
 elif [ "$COMMAND" = "oiv" ] ; then
-  CLASS=org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageViewer
+  CLASS=org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageViewerPB
 elif [ "$COMMAND" = "oev" ] ; then
   CLASS=org.apache.hadoop.hdfs.tools.offlineEditsViewer.OfflineEditsViewer
 elif [ "$COMMAND" = "fetchdt" ] ; then

+ 1 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReader.java

@@ -23,7 +23,6 @@ import java.util.EnumSet;
 import org.apache.hadoop.fs.ByteBufferReadable;
 import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 
 /**
@@ -97,6 +96,5 @@ public interface BlockReader extends ByteBufferReadable {
    * @return              The ClientMmap object, or null if mmap is not
    *                      supported.
    */
-  ClientMmap getClientMmap(EnumSet<ReadOption> opts,
-        ClientMmapManager mmapManager);
+  ClientMmap getClientMmap(EnumSet<ReadOption> opts);
 }

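getClientMmap no longer takes a ClientMmapManager; a minimal caller-side sketch of the new signature (the helper method is hypothetical, not part of the patch):

import java.util.EnumSet;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.client.ClientMmap;

class ZeroCopySketch {
  // Returns a zero-copy mmap for the current block, or null if the reader
  // cannot (or will not) serve this request via mmap.
  static ClientMmap tryZeroCopy(BlockReader reader) {
    return reader.getClientMmap(EnumSet.of(ReadOption.SKIP_CHECKSUMS));
  }
}
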
+ 676 - 164
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderFactory.java

@@ -24,215 +24,746 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.net.InetSocketAddress;
 
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache.ShortCircuitReplicaCreator;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
+import org.apache.hadoop.hdfs.net.DomainPeer;
 import org.apache.hadoop.hdfs.net.Peer;
-import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
 import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
-import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
-import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.net.unix.DomainSocket;
 import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.SecretManager.InvalidToken;
 import org.apache.hadoop.security.token.Token;
+import org.apache.hadoop.util.Time;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
 
 
 /** 
  * Utility class to create BlockReader implementations.
  */
 @InterfaceAudience.Private
-public class BlockReaderFactory {
+public class BlockReaderFactory implements ShortCircuitReplicaCreator {
+  static final Log LOG = LogFactory.getLog(BlockReaderFactory.class);
+
+  @VisibleForTesting
+  static ShortCircuitReplicaCreator
+      createShortCircuitReplicaInfoCallback = null;
+
+  private final DFSClient.Conf conf;
+
+  /**
+   * The file name, for logging and debugging purposes.
+   */
+  private String fileName;
+
+  /**
+   * The block ID and block pool ID to use.
+   */
+  private ExtendedBlock block;
+
+  /**
+   * The block token to use for security purposes.
+   */
+  private Token<BlockTokenIdentifier> token;
+
+  /**
+   * The offset within the block to start reading at.
+   */
+  private long startOffset;
+
+  /**
+   * If false, we won't try to verify the block checksum.
+   */
+  private boolean verifyChecksum;
+
+  /**
+   * The name of this client.
+   */
+  private String clientName; 
+
+  /**
+   * The DataNode we're talking to.
+   */
+  private DatanodeInfo datanode;
+
+  /**
+   * If false, we won't try short-circuit local reads.
+   */
+  private boolean allowShortCircuitLocalReads;
+
+  /**
+   * The ClientContext to use for things like the PeerCache.
+   */
+  private ClientContext clientContext;
+
+  /**
+   * Number of bytes to read.  -1 indicates no limit.
+   */
+  private long length = -1;
+
+  /**
+   * Caching strategy to use when reading the block.
+   */
+  private CachingStrategy cachingStrategy;
+
   /**
-   * Create a new BlockReader specifically to satisfy a read.
-   * This method also sends the OP_READ_BLOCK request.
-   * 
-   * @param conf the DFSClient configuration
-   * @param file  File location
-   * @param block  The block object
-   * @param blockToken  The block token for security
-   * @param startOffset  The read offset, relative to block head
-   * @param len  The number of bytes to read, or -1 to read as many as
-   *             possible.
-   * @param bufferSize  The IO buffer size (not the client buffer size)
-   *                    Ignored except on the legacy BlockReader.
-   * @param verifyChecksum  Whether to verify checksum
-   * @param clientName  Client name.  Used for log messages.
-   * @param peer  The peer
-   * @param datanodeID  The datanode that the Peer is connected to
-   * @param domainSocketFactory  The DomainSocketFactory to notify if the Peer
-   *                             is a DomainPeer which turns out to be faulty.
-   *                             If null, no factory will be notified in this
-   *                             case.
-   * @param allowShortCircuitLocalReads  True if short-circuit local reads
-   *                                     should be allowed.
-   * @return New BlockReader instance
+   * Socket address to use to connect to peer.
    */
-  public static BlockReader newBlockReader(DFSClient.Conf conf,
-                                     String file,
-                                     ExtendedBlock block, 
-                                     Token<BlockTokenIdentifier> blockToken,
-                                     long startOffset, long len,
-                                     boolean verifyChecksum,
-                                     String clientName,
-                                     Peer peer,
-                                     DatanodeID datanodeID,
-                                     DomainSocketFactory domSockFactory,
-                                     PeerCache peerCache,
-                                     FileInputStreamCache fisCache,
-                                     boolean allowShortCircuitLocalReads,
-                                     CachingStrategy cachingStrategy)
-  throws IOException {
-    peer.setReadTimeout(conf.socketTimeout);
-    peer.setWriteTimeout(HdfsServerConstants.WRITE_TIMEOUT);
-
-    if (peer.getDomainSocket() != null) {
-      if (allowShortCircuitLocalReads && !conf.useLegacyBlockReaderLocal) {
-        // If this is a domain socket, and short-circuit local reads are 
-        // enabled, try to set up a BlockReaderLocal.
-        BlockReader reader = newShortCircuitBlockReader(conf, file,
-            block, blockToken, startOffset, len, peer, datanodeID,
-            domSockFactory, verifyChecksum, fisCache, cachingStrategy);
+  private InetSocketAddress inetSocketAddress;
+
+  /**
+   * Remote peer factory to use to create a peer, if needed.
+   */
+  private RemotePeerFactory remotePeerFactory;
+
+  /**
+   * UserGroupInformation to use for legacy block reader local objects, if needed.
+   */
+  private UserGroupInformation userGroupInformation;
+
+  /**
+   * Configuration to use for legacy block reader local objects, if needed.
+   */
+  private Configuration configuration;
+
+  /**
+   * Information about the domain socket path we should use to connect to the
+   * local peer-- or null if we haven't examined the local domain socket.
+   */
+  private DomainSocketFactory.PathInfo pathInfo;
+
+  /**
+   * The remaining number of times that we'll try to pull a socket out of the
+   * cache.
+   */
+  private int remainingCacheTries;
+
+  public BlockReaderFactory(DFSClient.Conf conf) {
+    this.conf = conf;
+    this.remainingCacheTries = conf.nCachedConnRetry;
+  }
+
+  public BlockReaderFactory setFileName(String fileName) {
+    this.fileName = fileName;
+    return this;
+  }
+
+  public BlockReaderFactory setBlock(ExtendedBlock block) {
+    this.block = block;
+    return this;
+  }
+
+  public BlockReaderFactory setBlockToken(Token<BlockTokenIdentifier> token) {
+    this.token = token;
+    return this;
+  }
+
+  public BlockReaderFactory setStartOffset(long startOffset) {
+    this.startOffset = startOffset;
+    return this;
+  }
+
+  public BlockReaderFactory setVerifyChecksum(boolean verifyChecksum) {
+    this.verifyChecksum = verifyChecksum;
+    return this;
+  }
+
+  public BlockReaderFactory setClientName(String clientName) {
+    this.clientName = clientName;
+    return this;
+  }
+
+  public BlockReaderFactory setDatanodeInfo(DatanodeInfo datanode) {
+    this.datanode = datanode;
+    return this;
+  }
+
+  public BlockReaderFactory setAllowShortCircuitLocalReads(
+      boolean allowShortCircuitLocalReads) {
+    this.allowShortCircuitLocalReads = allowShortCircuitLocalReads;
+    return this;
+  }
+
+  public BlockReaderFactory setClientCacheContext(
+      ClientContext clientContext) {
+    this.clientContext = clientContext;
+    return this;
+  }
+
+  public BlockReaderFactory setLength(long length) {
+    this.length = length;
+    return this;
+  }
+
+  public BlockReaderFactory setCachingStrategy(
+      CachingStrategy cachingStrategy) {
+    this.cachingStrategy = cachingStrategy;
+    return this;
+  }
+
+  public BlockReaderFactory setInetSocketAddress (
+      InetSocketAddress inetSocketAddress) {
+    this.inetSocketAddress = inetSocketAddress;
+    return this;
+  }
+
+  public BlockReaderFactory setUserGroupInformation(
+      UserGroupInformation userGroupInformation) {
+    this.userGroupInformation = userGroupInformation;
+    return this;
+  }
+
+  public BlockReaderFactory setRemotePeerFactory(
+      RemotePeerFactory remotePeerFactory) {
+    this.remotePeerFactory = remotePeerFactory;
+    return this;
+  }
+
+  public BlockReaderFactory setConfiguration(
+      Configuration configuration) {
+    this.configuration = configuration;
+    return this;
+  }
+
+  /**
+   * Build a BlockReader with the given options.
+   *
+   * This function will do the best it can to create a block reader that meets
+   * all of our requirements.  We prefer short-circuit block readers
+   * (BlockReaderLocal and BlockReaderLocalLegacy) over remote ones, since the
+   * former avoid the overhead of socket communication.  If short-circuit is
+   * unavailable, our next fallback is data transfer over UNIX domain sockets,
+   * if dfs.client.domain.socket.data.traffic has been enabled.  If that doesn't
+   * work, we will try to create a remote block reader that operates over TCP
+   * sockets.
+   *
+   * There are a few caches that are important here.
+   *
+   * The ShortCircuitCache stores file descriptor objects which have been passed
+   * from the DataNode. 
+   *
+   * The DomainSocketFactory stores information about UNIX domain socket paths
+   * that we not been able to use in the past, so that we don't waste time
+   * retrying them over and over.  (Like all the caches, it does have a timeout,
+   * though.)
+   *
+   * The PeerCache stores peers that we have used in the past.  If we can reuse
+   * one of these peers, we avoid the overhead of re-opening a socket.  However,
+   * if the socket has been timed out on the remote end, our attempt to reuse
+   * the socket may end with an IOException.  For that reason, we limit our
+   * attempts at socket reuse to dfs.client.cached.conn.retry times.  After
+   * that, we create new sockets.  This avoids the problem where a thread tries
+   * to talk to a peer that it hasn't talked to in a while, and has to clean out
+   * every entry in a socket cache full of stale entries.
+   *
+   * @return The new BlockReader.  We will not return null.
+   *
+   * @throws InvalidToken
+   *             If the block token was invalid.
+   *         InvalidEncryptionKeyException
+   *             If the encryption key was invalid.
+   *         Other IOException
+   *             If there was another problem.
+   */
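To make the fallback strategy described above concrete, here is a minimal caller-side sketch of the fluent builder (every argument is a placeholder supplied by the caller; in the real client these come from the DFSInputStream read path, and nothing below is part of the patch itself):

import java.io.IOException;
import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.RemotePeerFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;

class BlockReaderBuildSketch {
  static BlockReader open(DFSClient.Conf conf, Configuration hadoopConf,
      ClientContext context, RemotePeerFactory peerFactory,
      String fileName, ExtendedBlock block, Token<BlockTokenIdentifier> token,
      DatanodeInfo datanode, InetSocketAddress addr,
      long startOffset, long length, CachingStrategy strategy)
      throws IOException {
    return new BlockReaderFactory(conf)
        .setConfiguration(hadoopConf)
        .setClientCacheContext(context)
        .setRemotePeerFactory(peerFactory)
        .setUserGroupInformation(UserGroupInformation.getCurrentUser())
        .setFileName(fileName)
        .setBlock(block)
        .setBlockToken(token)
        .setDatanodeInfo(datanode)
        .setInetSocketAddress(addr)
        .setStartOffset(startOffset)
        .setLength(length)
        .setVerifyChecksum(true)
        .setAllowShortCircuitLocalReads(true)
        .setClientName("sketch-client")
        .setCachingStrategy(strategy)
        .build();  // tries short-circuit, then domain-socket, then TCP
  }
}
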
+  public BlockReader build() throws IOException {
+    BlockReader reader = null;
+
+    Preconditions.checkNotNull(configuration);
+    if (conf.shortCircuitLocalReads && allowShortCircuitLocalReads) {
+      if (clientContext.getUseLegacyBlockReaderLocal()) {
+        reader = getLegacyBlockReaderLocal();
         if (reader != null) {
-          // One we've constructed the short-circuit block reader, we don't
-          // need the socket any more.  So let's return it to the cache.
-          if (peerCache != null) {
-            peerCache.put(datanodeID, peer);
-          } else {
-            IOUtils.cleanup(null, peer);
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(this + ": returning new legacy block reader local.");
+          }
+          return reader;
+        }
+      } else {
+        reader = getBlockReaderLocal();
+        if (reader != null) {
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(this + ": returning new block reader local.");
           }
           return reader;
         }
       }
-      // If this is a domain socket and we couldn't (or didn't want to) set
-      // up a BlockReaderLocal, check that we are allowed to pass data traffic
-      // over the socket before proceeding.
-      if (!conf.domainSocketDataTraffic) {
-        throw new IOException("Because we can't do short-circuit access, " +
-          "and data traffic over domain sockets is disabled, " +
-          "we cannot use this socket to talk to " + datanodeID);
+    }
+    if (conf.domainSocketDataTraffic) {
+      reader = getRemoteBlockReaderFromDomain();
+      if (reader != null) {
+        if (LOG.isTraceEnabled()) {
+          LOG.trace(this + ": returning new remote block reader using " +
+              "UNIX domain socket on " + pathInfo.getPath());
+        }
+        return reader;
       }
     }
+    Preconditions.checkState(!DFSInputStream.tcpReadsDisabledForTesting,
+        "TCP reads were disabled for testing, but we failed to " +
+        "do a non-TCP read.");
+    return getRemoteBlockReaderFromTcp();
+  }
 
-    if (conf.useLegacyBlockReader) {
-      @SuppressWarnings("deprecation")
-      RemoteBlockReader reader = RemoteBlockReader.newBlockReader(file,
-          block, blockToken, startOffset, len, conf.ioBufferSize,
-          verifyChecksum, clientName, peer, datanodeID, peerCache,
-          cachingStrategy);
-      return reader;
-    } else {
-      return RemoteBlockReader2.newBlockReader(
-          file, block, blockToken, startOffset, len,
-          verifyChecksum, clientName, peer, datanodeID, peerCache,
-          cachingStrategy);
+  /**
+   * Get {@link BlockReaderLocalLegacy} for short circuited local reads.
+   * This block reader implements the path-based style of local reads
+   * first introduced in HDFS-2246.
+   */
+  private BlockReader getLegacyBlockReaderLocal() throws IOException {
+    if (LOG.isTraceEnabled()) {
+      LOG.trace(this + ": trying to construct BlockReaderLocalLegacy");
     }
+    if (!DFSClient.isLocalAddress(inetSocketAddress)) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
+            "the address " + inetSocketAddress + " is not local");
+      }
+      return null;
+    }
+    if (clientContext.getDisableLegacyBlockReaderLocal()) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
+            "disableLegacyBlockReaderLocal is set.");
+      }
+      return null;
+    }
+    IOException ioe = null;
+    try {
+      return BlockReaderLocalLegacy.newBlockReader(conf,
+          userGroupInformation, configuration, fileName, block, token,
+          datanode, startOffset, length);
+    } catch (RemoteException remoteException) {
+      ioe = remoteException.unwrapRemoteException(
+                InvalidToken.class, AccessControlException.class);
+    } catch (IOException e) {
+      ioe = e;
+    }
+    if ((!(ioe instanceof AccessControlException)) &&
+        isSecurityException(ioe)) {
+      // Handle security exceptions.
+      // We do not handle AccessControlException here, since
+      // BlockReaderLocalLegacy#newBlockReader uses that exception to indicate
+      // that the user is not in dfs.block.local-path-access.user, a condition
+      // which requires us to disable legacy SCR.
+      throw ioe;
+    }
+    LOG.warn(this + ": error creating legacy BlockReaderLocal.  " +
+        "Disabling legacy local reads.", ioe);
+    clientContext.setDisableLegacyBlockReaderLocal();
+    return null;
+  }
+
+  private BlockReader getBlockReaderLocal() throws InvalidToken {
+    if (LOG.isTraceEnabled()) {
+      LOG.trace(this + ": trying to construct a BlockReaderLocal " +
+          "for short-circuit reads.");
+    }
+    if (pathInfo == null) {
+      pathInfo = clientContext.getDomainSocketFactory().
+                      getPathInfo(inetSocketAddress, conf);
+    }
+    if (!pathInfo.getPathState().getUsableForShortCircuit()) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": " + pathInfo + " is not " +
+            "usable for short circuit; giving up on BlockReaderLocal.");
+      }
+      return null;
+    }
+    ShortCircuitCache cache = clientContext.getShortCircuitCache();
+    ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
+    ShortCircuitReplicaInfo info = cache.fetchOrCreate(key, this);
+    InvalidToken exc = info.getInvalidTokenException();
+    if (exc != null) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": got InvalidToken exception while trying to " +
+            "construct BlockReaderLocal via " + pathInfo.getPath());
+      }
+      throw exc;
+    }
+    if (info.getReplica() == null) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": failed to get ShortCircuitReplica.  " +
+            "Cannot construct BlockReaderLocal via " + pathInfo.getPath());
+      }
+      return null;
+    }
+    return new BlockReaderLocal.Builder(conf).
+        setFilename(fileName).
+        setBlock(block).
+        setStartOffset(startOffset).
+        setShortCircuitReplica(info.getReplica()).
+        setDatanodeID(datanode).
+        setVerifyChecksum(verifyChecksum).
+        setCachingStrategy(cachingStrategy).
+        build();
+  }
+
+  /**
+   * Fetch a pair of short-circuit block descriptors from a local DataNode.
+   *
+   * @return    Null if we could not communicate with the datanode,
+   *            a new ShortCircuitReplicaInfo object otherwise.
+   *            ShortCircuitReplicaInfo objects may contain either an InvalidToken
+   *            exception, or a ShortCircuitReplica object ready to use.
+   */
+  @Override
+  public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+    if (createShortCircuitReplicaInfoCallback != null) {
+      ShortCircuitReplicaInfo info =
+        createShortCircuitReplicaInfoCallback.createShortCircuitReplicaInfo();
+      if (info != null) return info;
+    }
+    if (LOG.isTraceEnabled()) {
+      LOG.trace(this + ": trying to create ShortCircuitReplicaInfo.");
+    }
+    BlockReaderPeer curPeer;
+    while (true) {
+      curPeer = nextDomainPeer();
+      if (curPeer == null) break;
+      DomainPeer peer = (DomainPeer)curPeer.peer;
+      try {
+        ShortCircuitReplicaInfo info = requestFileDescriptors(peer);
+        clientContext.getPeerCache().put(datanode, peer);
+        return info;
+      } catch (IOException e) {
+        if (curPeer.fromCache) {
+          // Handle an I/O error we got when using a cached socket.
+          // These are considered less serious, because the socket may be stale.
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(this + ": closing stale domain peer " + peer, e);
+          }
+          IOUtils.cleanup(LOG, peer);
+        } else {
+          // Handle an I/O error we got when using a newly created socket.
+          // We temporarily disable the domain socket path for a few minutes in
+          // this case, to prevent wasting more time on it.
+          LOG.warn(this + ": I/O error requesting file descriptors.  " + 
+              "Disabling domain socket " + peer.getDomainSocket(), e);
+          IOUtils.cleanup(LOG, peer);
+          clientContext.getDomainSocketFactory()
+              .disableDomainSocketPath(pathInfo.getPath());
+          return null;
+        }
+      }
+    }
+    return null;
   }
 
   /**
-   * Create a new short-circuit BlockReader.
-   * 
-   * Here, we ask the DataNode to pass us file descriptors over our
-   * DomainSocket.  If the DataNode declines to do so, we'll return null here;
-   * otherwise, we'll return the BlockReaderLocal.  If the DataNode declines,
-   * this function will inform the DomainSocketFactory that short-circuit local
-   * reads are disabled for this DataNode, so that we don't ask again.
-   * 
-   * @param conf               the configuration.
-   * @param file               the file name. Used in log messages.
-   * @param block              The block object.
-   * @param blockToken         The block token for security.
-   * @param startOffset        The read offset, relative to block head.
-   * @param len                The number of bytes to read, or -1 to read 
-   *                           as many as possible.
-   * @param peer               The peer to use.
-   * @param datanodeID         The datanode that the Peer is connected to.
-   * @param domSockFactory     The DomainSocketFactory to notify if the Peer
-   *                           is a DomainPeer which turns out to be faulty.
-   *                           If null, no factory will be notified in this
-   *                           case.
-   * @param verifyChecksum     True if we should verify the checksums.
-   *                           Note: even if this is true, when
-   *                           DFS_CLIENT_READ_CHECKSUM_SKIP_CHECKSUM_KEY is
-   *                           set or the block is mlocked, we will skip
-   *                           checksums.
+   * Request file descriptors from a DomainPeer.
    *
-   * @return                   The BlockReaderLocal, or null if the
-   *                           DataNode declined to provide short-circuit
-   *                           access.
-   * @throws IOException       If there was a communication error.
+   * @return  A ShortCircuitReplicaInfo object if we could communicate with the
+   *          datanode; null otherwise.
+   * @throws  IOException If we encountered an I/O exception while communicating
+   *          with the datanode.
    */
-  private static BlockReaderLocal newShortCircuitBlockReader(
-      DFSClient.Conf conf, String file, ExtendedBlock block,
-      Token<BlockTokenIdentifier> blockToken, long startOffset,
-      long len, Peer peer, DatanodeID datanodeID,
-      DomainSocketFactory domSockFactory, boolean verifyChecksum,
-      FileInputStreamCache fisCache,
-      CachingStrategy cachingStrategy) throws IOException {
+  private ShortCircuitReplicaInfo requestFileDescriptors(DomainPeer peer)
+        throws IOException {
     final DataOutputStream out =
-        new DataOutputStream(new BufferedOutputStream(
-          peer.getOutputStream()));
-    new Sender(out).requestShortCircuitFds(block, blockToken, 1);
-    DataInputStream in =
-        new DataInputStream(peer.getInputStream());
+        new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
+    new Sender(out).requestShortCircuitFds(block, token, 1);
+    DataInputStream in = new DataInputStream(peer.getInputStream());
     BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
         PBHelper.vintPrefixed(in));
     DomainSocket sock = peer.getDomainSocket();
     switch (resp.getStatus()) {
     case SUCCESS:
-      BlockReaderLocal reader = null;
       byte buf[] = new byte[1];
       FileInputStream fis[] = new FileInputStream[2];
       sock.recvFileInputStreams(fis, buf, 0, buf.length);
+      ShortCircuitReplica replica = null;
       try {
-        reader = new BlockReaderLocal.Builder(conf).
-            setFilename(file).
-            setBlock(block).
-            setStartOffset(startOffset).
-            setStreams(fis).
-            setDatanodeID(datanodeID).
-            setVerifyChecksum(verifyChecksum).
-            setBlockMetadataHeader(
-                BlockMetadataHeader.preadHeader(fis[1].getChannel())).
-            setFileInputStreamCache(fisCache).
-            setCachingStrategy(cachingStrategy).
-            build();
+        ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
+        replica = new ShortCircuitReplica(key, fis[0], fis[1],
+            clientContext.getShortCircuitCache(), Time.monotonicNow());
+      } catch (IOException e) {
+        // This indicates an error reading from disk, or a format error.  Since
+        // it's not a socket communication problem, we return null rather than
+        // throwing an exception.
+        LOG.warn(this + ": error creating ShortCircuitReplica.", e);
+        return null;
       } finally {
-        if (reader == null) {
+        if (replica == null) {
           IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
         }
       }
-      return reader;
+      return new ShortCircuitReplicaInfo(replica);
     case ERROR_UNSUPPORTED:
       if (!resp.hasShortCircuitAccessVersion()) {
-        DFSClient.LOG.warn("short-circuit read access is disabled for " +
-            "DataNode " + datanodeID + ".  reason: " + resp.getMessage());
-        domSockFactory.disableShortCircuitForPath(sock.getPath());
+        LOG.warn("short-circuit read access is disabled for " +
+            "DataNode " + datanode + ".  reason: " + resp.getMessage());
+        clientContext.getDomainSocketFactory()
+            .disableShortCircuitForPath(pathInfo.getPath());
       } else {
-        DFSClient.LOG.warn("short-circuit read access for the file " +
-            file + " is disabled for DataNode " + datanodeID +
+        LOG.warn("short-circuit read access for the file " +
+            fileName + " is disabled for DataNode " + datanode +
             ".  reason: " + resp.getMessage());
       }
       return null;
     case ERROR_ACCESS_TOKEN:
       String msg = "access control error while " +
           "attempting to set up short-circuit access to " +
-          file + resp.getMessage();
-      DFSClient.LOG.debug(msg);
-      throw new InvalidBlockTokenException(msg);
+          fileName + ": " + resp.getMessage();
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(this + ":" + msg);
+      }
+      return new ShortCircuitReplicaInfo(new InvalidToken(msg));
     default:
-      DFSClient.LOG.warn("error while attempting to set up short-circuit " +
-          "access to " + file + ": " + resp.getMessage());
-      domSockFactory.disableShortCircuitForPath(sock.getPath());
+      LOG.warn(this + "unknown response code " + resp.getStatus() + " while " +
+          "attempting to set up short-circuit access. " + resp.getMessage());
+      clientContext.getDomainSocketFactory()
+          .disableShortCircuitForPath(pathInfo.getPath());
+      return null;
+    }
+  }
+
+  /**
+   * Get a RemoteBlockReader that communicates over a UNIX domain socket.
+   *
+   * @return The new BlockReader, or null if we failed to create the block
+   * reader.
+   *
+   * @throws InvalidToken    If the block token was invalid.
+   * Potentially other security-related exceptions.
+   */
+  private BlockReader getRemoteBlockReaderFromDomain() throws IOException {
+    if (pathInfo == null) {
+      pathInfo = clientContext.getDomainSocketFactory().
+                      getPathInfo(inetSocketAddress, conf);
+    }
+    if (!pathInfo.getPathState().getUsableForDataTransfer()) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": not trying to create a remote block reader " +
+            "because the UNIX domain socket at " + pathInfo +
+            " is not usable.");
+      }
       return null;
     }
+    if (LOG.isTraceEnabled()) {
+      LOG.trace(this + ": trying to create a remote block reader from the " +
+          "UNIX domain socket at " + pathInfo.getPath());
+    }
+
+    while (true) {
+      BlockReaderPeer curPeer = nextDomainPeer();
+      if (curPeer == null) break;
+      DomainPeer peer = (DomainPeer)curPeer.peer;
+      BlockReader blockReader = null;
+      try {
+        blockReader = getRemoteBlockReader(peer);
+        return blockReader;
+      } catch (IOException ioe) {
+        IOUtils.cleanup(LOG, peer);
+        if (isSecurityException(ioe)) {
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(this + ": got security exception while constructing " +
+                "a remote block reader from the unix domain socket at " +
+                pathInfo.getPath(), ioe);
+          }
+          throw ioe;
+        }
+        if (curPeer.fromCache) {
+          // Handle an I/O error we got when using a cached peer.  These are
+          // considered less serious, because the underlying socket may be stale.
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Closed potentially stale domain peer " + peer, ioe);
+          }
+        } else {
+          // Handle an I/O error we got when using a newly created domain peer.
+          // We temporarily disable the domain socket path for a few minutes in
+          // this case, to prevent wasting more time on it.
+          LOG.warn("I/O error constructing remote block reader.  Disabling " +
+              "domain socket " + peer.getDomainSocket(), ioe);
+          clientContext.getDomainSocketFactory()
+              .disableDomainSocketPath(pathInfo.getPath());
+          return null;
+        }
+      } finally {
+        if (blockReader == null) {
+          IOUtils.cleanup(LOG, peer);
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Get a RemoteBlockReader that communicates over a TCP socket.
+   *
+   * @return The new BlockReader.  We will not return null, but instead throw
+   *         an exception if this fails.
+   *
+   * @throws InvalidToken
+   *             If the block token was invalid.
+   *         InvalidEncryptionKeyException
+   *             If the encryption key was invalid.
+   *         Other IOException
+   *             If there was another problem.
+   */
+  private BlockReader getRemoteBlockReaderFromTcp() throws IOException {
+    if (LOG.isTraceEnabled()) {
+      LOG.trace(this + ": trying to create a remote block reader from a " +
+          "TCP socket");
+    }
+    BlockReader blockReader = null;
+    while (true) {
+      BlockReaderPeer curPeer = null;
+      Peer peer = null;
+      try {
+        curPeer = nextTcpPeer();
+        if (curPeer == null) break;
+        peer = curPeer.peer;
+        blockReader = getRemoteBlockReader(peer);
+        return blockReader;
+      } catch (IOException ioe) {
+        if (isSecurityException(ioe)) {
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(this + ": got security exception while constructing " +
+                "a remote block reader from " + peer, ioe);
+          }
+          throw ioe;
+        }
+        if ((curPeer != null) && curPeer.fromCache) {
+          // Handle an I/O error we got when using a cached peer.  These are
+          // considered less serious, because the underlying socket may be
+          // stale.
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Closed potentially stale remote peer " + peer, ioe);
+          }
+        } else {
+          // Handle an I/O error we got when using a newly created peer.
+          LOG.warn("I/O error constructing remote block reader.", ioe);
+          throw ioe;
+        }
+      } finally {
+        if (blockReader == null) {
+          IOUtils.cleanup(LOG, peer);
+        }
+      }
+    }
+    return null;
+  }
+
+  private static class BlockReaderPeer {
+    final Peer peer;
+    final boolean fromCache;
+    
+    BlockReaderPeer(Peer peer, boolean fromCache) {
+      this.peer = peer;
+      this.fromCache = fromCache;
+    }
+  }
+
+  /**
+   * Get the next DomainPeer, either from the cache or by creating it.
+   *
+   * @return the next DomainPeer, or null if we could not construct one.
+   */
+  private BlockReaderPeer nextDomainPeer() {
+    if (remainingCacheTries > 0) {
+      Peer peer = clientContext.getPeerCache().get(datanode, true);
+      if (peer != null) {
+        remainingCacheTries--;
+        if (LOG.isTraceEnabled()) {
+          LOG.trace("nextDomainPeer: reusing existing peer " + peer);
+        }
+        return new BlockReaderPeer(peer, true);
+      }
+    }
+    DomainSocket sock = clientContext.getDomainSocketFactory().
+        createSocket(pathInfo, conf.socketTimeout);
+    if (sock == null) return null;
+    return new BlockReaderPeer(new DomainPeer(sock), false);
+  }
+
+  /**
+   * Get the next TCP-based peer, either from the cache or by creating it.
+   *
+   * @return the next Peer, or null if we could not construct one.
+   *
+   * @throws IOException  If there was an error while constructing the peer
+   *                      (such as an InvalidEncryptionKeyException)
+   */
+  private BlockReaderPeer nextTcpPeer() throws IOException {
+    if (remainingCacheTries > 0) {
+      Peer peer = clientContext.getPeerCache().get(datanode, false);
+      if (peer != null) {
+        remainingCacheTries--;
+        if (LOG.isTraceEnabled()) {
+          LOG.trace("nextTcpPeer: reusing existing peer " + peer);
+        }
+        return new BlockReaderPeer(peer, true);
+      }
+    }
+    try {
+      Peer peer = remotePeerFactory.newConnectedPeer(inetSocketAddress);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("nextTcpPeer: created newConnectedPeer " + peer);
+      }
+      return new BlockReaderPeer(peer, false);
+    } catch (IOException e) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("nextTcpPeer: failed to create newConnectedPeer " +
+                  "connected to " + datanode);
+      }
+      throw e;
+    }
+  }
+
+  /**
+   * Determine if an exception is security-related.
+   *
+   * We need to handle these exceptions differently than other IOExceptions.
+   * They don't indicate a communication problem.  Instead, they mean that there
+   * is some action the client needs to take, such as refetching block tokens,
+   * renewing encryption keys, etc.
+   *
+   * @param ioe    The exception
+   * @return       True only if the exception is security-related.
+   */
+  private static boolean isSecurityException(IOException ioe) {
+    return (ioe instanceof InvalidToken) ||
+            (ioe instanceof InvalidEncryptionKeyException) ||
+            (ioe instanceof InvalidBlockTokenException) ||
+            (ioe instanceof AccessControlException);
+  }
+
+  @SuppressWarnings("deprecation")
+  private BlockReader getRemoteBlockReader(Peer peer) throws IOException {
+    if (conf.useLegacyBlockReader) {
+      return RemoteBlockReader.newBlockReader(fileName,
+          block, token, startOffset, length, conf.ioBufferSize,
+          verifyChecksum, clientName, peer, datanode,
+          clientContext.getPeerCache(), cachingStrategy);
+    } else {
+      return RemoteBlockReader2.newBlockReader(
+          fileName, block, token, startOffset, length,
+          verifyChecksum, clientName, peer, datanode,
+          clientContext.getPeerCache(), cachingStrategy);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "BlockReaderFactory(fileName=" + fileName + ", block=" + block + ")";
   }
 
   /**
@@ -246,23 +777,4 @@ public class BlockReaderFactory {
       final String poolId, final long blockId) {
     return s.toString() + ":" + poolId + ":" + blockId;
   }
-
-  /**
-   * Get {@link BlockReaderLocalLegacy} for short circuited local reads.
-   * This block reader implements the path-based style of local reads
-   * first introduced in HDFS-2246.
-   */
-  static BlockReader getLegacyBlockReaderLocal(DFSClient dfsClient,
-      String src, ExtendedBlock blk,
-      Token<BlockTokenIdentifier> accessToken, DatanodeInfo chosenNode,
-      long offsetIntoBlock) throws InvalidToken, IOException {
-    try {
-      final long length = blk.getNumBytes() - offsetIntoBlock;
-      return BlockReaderLocalLegacy.newBlockReader(dfsClient, src, blk,
-          accessToken, chosenNode, offsetIntoBlock, length);
-    } catch (RemoteException re) {
-      throw re.unwrapRemoteException(InvalidToken.class,
-          AccessControlException.class);
-    }
-  }
 }
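The fall-through order implemented in build() above (legacy BlockReaderLocal, then the new BlockReaderLocal, then a remote reader over a UNIX domain socket, then plain TCP) is selected purely by client configuration. As a minimal, purely illustrative sketch of how a client might enable those paths, using the standard HDFS client keys referenced in the comments above (the socket path and retry count are arbitrary example values, not recommendations):

import org.apache.hadoop.conf.Configuration;

public class ShortCircuitReadConfSketch {
  public static Configuration example() {
    Configuration conf = new Configuration();
    // Prefer short-circuit local reads (BlockReaderLocal) where possible.
    conf.setBoolean("dfs.client.read.shortcircuit", true);
    // UNIX domain socket shared with the local DataNode (example path).
    conf.set("dfs.domain.socket.path", "/var/run/hadoop-hdfs/dn_socket");
    // Allow ordinary block traffic over the domain socket as a fallback.
    conf.setBoolean("dfs.client.domain.socket.data.traffic", true);
    // Number of cached connections to try before opening a fresh socket,
    // as described in the build() comment above.
    conf.setInt("dfs.client.cached.conn.retry", 3);
    return conf;
  }
}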

+ 15 - 70
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderLocal.java

@@ -28,8 +28,9 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
 import org.apache.hadoop.hdfs.DFSClient.Conf;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
@@ -67,12 +68,10 @@ class BlockReaderLocal implements BlockReader {
     private boolean verifyChecksum;
     private int maxReadahead;
     private String filename;
-    private FileInputStream streams[];
+    private ShortCircuitReplica replica;
     private long dataPos;
     private DatanodeID datanodeID;
-    private FileInputStreamCache fisCache;
     private boolean mlocked;
-    private BlockMetadataHeader header;
     private ExtendedBlock block;
 
     public Builder(Conf conf) {
@@ -99,8 +98,8 @@ class BlockReaderLocal implements BlockReader {
       return this;
     }
 
-    public Builder setStreams(FileInputStream streams[]) {
-      this.streams = streams;
+    public Builder setShortCircuitReplica(ShortCircuitReplica replica) {
+      this.replica = replica;
       return this;
     }
 
@@ -114,30 +113,18 @@ class BlockReaderLocal implements BlockReader {
       return this;
     }
 
-    public Builder setFileInputStreamCache(FileInputStreamCache fisCache) {
-      this.fisCache = fisCache;
-      return this;
-    }
-
     public Builder setMlocked(boolean mlocked) {
       this.mlocked = mlocked;
       return this;
     }
 
-    public Builder setBlockMetadataHeader(BlockMetadataHeader header) {
-      this.header = header;
-      return this;
-    }
-
     public Builder setBlock(ExtendedBlock block) {
       this.block = block;
       return this;
     }
 
     public BlockReaderLocal build() {
-      Preconditions.checkNotNull(streams);
-      Preconditions.checkArgument(streams.length == 2);
-      Preconditions.checkNotNull(header);
+      Preconditions.checkNotNull(replica);
       return new BlockReaderLocal(this);
     }
   }
@@ -147,7 +134,7 @@ class BlockReaderLocal implements BlockReader {
   /**
    * Pair of streams for this block.
    */
-  private final FileInputStream streams[];
+  private final ShortCircuitReplica replica;
 
   /**
    * The data FileChannel.
@@ -207,12 +194,6 @@ class BlockReaderLocal implements BlockReader {
    */
   private int checksumSize;
 
-  /**
-   * FileInputStream cache to return the streams to upon closing,
-   * or null if we should just close them unconditionally.
-   */
-  private final FileInputStreamCache fisCache;
-
   /**
    * Maximum number of chunks to allocate.
    *
@@ -257,20 +238,18 @@ class BlockReaderLocal implements BlockReader {
    */
   private ByteBuffer checksumBuf;
 
-  private boolean mmapDisabled = false;
-
   private BlockReaderLocal(Builder builder) {
-    this.streams = builder.streams;
-    this.dataIn = builder.streams[0].getChannel();
+    this.replica = builder.replica;
+    this.dataIn = replica.getDataStream().getChannel();
     this.dataPos = builder.dataPos;
-    this.checksumIn = builder.streams[1].getChannel();
-    this.checksum = builder.header.getChecksum();
+    this.checksumIn = replica.getMetaStream().getChannel();
+    BlockMetadataHeader header = builder.replica.getMetaHeader();
+    this.checksum = header.getChecksum();
     this.verifyChecksum = builder.verifyChecksum &&
         (this.checksum.getChecksumType().id != DataChecksum.CHECKSUM_NULL);
     this.mlocked = new AtomicBoolean(builder.mlocked);
     this.filename = builder.filename;
     this.datanodeID = builder.datanodeID;
-    this.fisCache = builder.fisCache;
     this.block = builder.block;
     this.bytesPerChecksum = checksum.getBytesPerChecksum();
     this.checksumSize = checksum.getChecksumSize();
@@ -642,20 +621,7 @@ class BlockReaderLocal implements BlockReader {
     if (LOG.isTraceEnabled()) {
       LOG.trace("close(filename=" + filename + ", block=" + block + ")");
     }
-    if (clientMmap != null) {
-      clientMmap.unref();
-      clientMmap = null;
-    }
-    if (fisCache != null) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("putting FileInputStream for " + filename +
-            " back into FileInputStreamCache");
-      }
-      fisCache.put(datanodeID, block, streams);
-    } else {
-      LOG.debug("closing FileInputStream for " + filename);
-      IOUtils.cleanup(LOG, dataIn, checksumIn);
-    }
+    replica.unref();
     freeDataBufIfExists();
     freeChecksumBufIfExists();
   }
@@ -683,8 +649,7 @@ class BlockReaderLocal implements BlockReader {
   }
 
   @Override
-  public synchronized ClientMmap getClientMmap(EnumSet<ReadOption> opts,
-        ClientMmapManager mmapManager) {
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
     if ((!opts.contains(ReadOption.SKIP_CHECKSUMS)) &&
           verifyChecksum && (!mlocked.get())) {
       if (LOG.isTraceEnabled()) {
@@ -694,27 +659,7 @@ class BlockReaderLocal implements BlockReader {
       }
       return null;
     }
-    if (clientMmap == null) {
-      if (mmapDisabled) {
-        return null;
-      }
-      try {
-        clientMmap = mmapManager.fetch(datanodeID, block, streams[0]);
-        if (clientMmap == null) {
-          mmapDisabled = true;
-          return null;
-        }
-      } catch (InterruptedException e) {
-        LOG.error("Interrupted while setting up mmap for " + filename, e);
-        Thread.currentThread().interrupt();
-        return null;
-      } catch (IOException e) {
-        LOG.error("unable to set up mmap for " + filename, e);
-        mmapDisabled = true;
-        return null;
-      }
-    }
-    return clientMmap;
+    return replica.getOrCreateClientMmap();
   }
 
   /**
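With the change above, memory maps for a local replica now come from the shared ShortCircuitCache via ShortCircuitReplica.getOrCreateClientMmap(), instead of a per-client ClientMmapManager. From application code this path is reached through the public zero-copy read API on FSDataInputStream. A hedged usage sketch follows; the file path and request size are arbitrary, and an mmap is only used when checksums can be skipped or the block is mlocked, matching the check in getClientMmap() above:

import java.nio.ByteBuffer;
import java.util.EnumSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.io.ElasticByteBufferPool;

public class ZeroCopyReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    ElasticByteBufferPool pool = new ElasticByteBufferPool();
    FSDataInputStream in = fs.open(new Path("/example/file"));
    try {
      // Requests up to 1 MB; may be served from an mmap of the local
      // replica, or fall back to an ordinary buffered read.
      ByteBuffer buf = in.read(pool, 1024 * 1024,
          EnumSet.of(ReadOption.SKIP_CHECKSUMS));
      if (buf != null) {
        // ... consume buf ...
        in.releaseBuffer(buf);
      }
    } finally {
      in.close();
    }
  }
}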

+ 11 - 11
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/BlockReaderLocalLegacy.java

@@ -31,7 +31,6 @@ import java.util.Map;
 
 import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -175,19 +174,21 @@ class BlockReaderLocalLegacy implements BlockReader {
   /**
    * The only way this object can be instantiated.
    */
-  static BlockReaderLocalLegacy newBlockReader(DFSClient dfsClient,
-      String file, ExtendedBlock blk, Token<BlockTokenIdentifier> token,
-      DatanodeInfo node, long startOffset, long length)
-      throws IOException {
-    final DFSClient.Conf conf = dfsClient.getConf();
-
+  static BlockReaderLocalLegacy newBlockReader(DFSClient.Conf conf,
+      UserGroupInformation userGroupInformation,
+      Configuration configuration, String file, ExtendedBlock blk,
+      Token<BlockTokenIdentifier> token, DatanodeInfo node, 
+      long startOffset, long length) throws IOException {
     LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node
         .getIpcPort());
     // check the cache first
     BlockLocalPathInfo pathinfo = localDatanodeInfo.getBlockLocalPathInfo(blk);
     if (pathinfo == null) {
-      pathinfo = getBlockPathInfo(dfsClient.ugi, blk, node,
-          dfsClient.getConfiguration(), dfsClient.getHdfsTimeout(), token,
+      if (userGroupInformation == null) {
+        userGroupInformation = UserGroupInformation.getCurrentUser();
+      }
+      pathinfo = getBlockPathInfo(userGroupInformation, blk, node,
+          configuration, conf.hdfsTimeout, token,
           conf.connectToDnViaHostname);
     }
 
@@ -708,8 +709,7 @@ class BlockReaderLocalLegacy implements BlockReader {
   }
 
   @Override
-  public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
-        ClientMmapManager mmapManager) {
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
     return null;
   }
 }

+ 204 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/ClientContext.java

@@ -0,0 +1,204 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import java.util.HashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hdfs.DFSClient;
+import org.apache.hadoop.hdfs.DFSClient.Conf;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * ClientContext contains context information for a client.
+ * 
+ * This allows us to share caches such as the socket cache across
+ * DFSClient instances.
+ */
+@InterfaceAudience.Private
+public class ClientContext {
+  private static final Log LOG = LogFactory.getLog(ClientContext.class);
+
+  /**
+   * Global map of context names to caches contexts.
+   */
+  private final static HashMap<String, ClientContext> CACHES =
+      new HashMap<String, ClientContext>();
+
+  /**
+   * Name of context.
+   */
+  private final String name;
+
+  /**
+   * String representation of the configuration.
+   */
+  private final String confString;
+
+  /**
+   * Caches short-circuit file descriptors and memory-mapped regions.
+   */
+  private final ShortCircuitCache shortCircuitCache;
+
+  /**
+   * Caches TCP and UNIX domain sockets for reuse.
+   */
+  private final PeerCache peerCache;
+
+  /**
+   * Stores information about socket paths.
+   */
+  private final DomainSocketFactory domainSocketFactory;
+
+  /**
+   * True if we should use the legacy BlockReaderLocal.
+   */
+  private final boolean useLegacyBlockReaderLocal;
+
+  /**
+   * True if the legacy BlockReaderLocal is disabled.
+   *
+   * The legacy block reader local gets disabled completely whenever there is an
+   * error or miscommunication.  The new block reader local code handles this
+   * case more gracefully inside DomainSocketFactory.
+   */
+  private volatile boolean disableLegacyBlockReaderLocal = false;
+
+  /**
+   * Whether or not we have already warned about a DFSClient fetching a
+   * ClientContext that did not match its configuration values.
+   */
+  private boolean printedConfWarning = false;
+
+  private ClientContext(String name, Conf conf) {
+    this.name = name;
+    this.confString = confAsString(conf);
+    this.shortCircuitCache = new ShortCircuitCache(
+        conf.shortCircuitStreamsCacheSize,
+        conf.shortCircuitStreamsCacheExpiryMs,
+        conf.shortCircuitMmapCacheSize,
+        conf.shortCircuitMmapCacheExpiryMs,
+        conf.shortCircuitMmapCacheRetryTimeout,
+        conf.shortCircuitCacheStaleThresholdMs);
+    this.peerCache =
+          new PeerCache(conf.socketCacheCapacity, conf.socketCacheExpiry);
+    this.useLegacyBlockReaderLocal = conf.useLegacyBlockReaderLocal;
+    this.domainSocketFactory = new DomainSocketFactory(conf);
+  }
+
+  public static String confAsString(Conf conf) {
+    StringBuilder builder = new StringBuilder();
+    builder.append("shortCircuitStreamsCacheSize = ").
+      append(conf.shortCircuitStreamsCacheSize).
+      append(", shortCircuitStreamsCacheExpiryMs = ").
+      append(conf.shortCircuitStreamsCacheExpiryMs).
+      append(", shortCircuitMmapCacheSize = ").
+      append(conf.shortCircuitMmapCacheSize).
+      append(", shortCircuitMmapCacheExpiryMs = ").
+      append(conf.shortCircuitMmapCacheExpiryMs).
+      append(", shortCircuitMmapCacheRetryTimeout = ").
+      append(conf.shortCircuitMmapCacheRetryTimeout).
+      append(", shortCircuitCacheStaleThresholdMs = ").
+      append(conf.shortCircuitCacheStaleThresholdMs).
+      append(", socketCacheCapacity = ").
+      append(conf.socketCacheCapacity).
+      append(", socketCacheExpiry = ").
+      append(conf.socketCacheExpiry).
+      append(", shortCircuitLocalReads = ").
+      append(conf.shortCircuitLocalReads).
+      append(", useLegacyBlockReaderLocal = ").
+      append(conf.useLegacyBlockReaderLocal).
+      append(", domainSocketDataTraffic = ").
+      append(conf.domainSocketDataTraffic);
+
+    return builder.toString();
+  }
+
+  public static ClientContext get(String name, Conf conf) {
+    ClientContext context;
+    synchronized(ClientContext.class) {
+      context = CACHES.get(name);
+      if (context == null) {
+        context = new ClientContext(name, conf);
+        CACHES.put(name, context);
+      } else {
+        context.printConfWarningIfNeeded(conf);
+      }
+    }
+    return context;
+  }
+
+  /**
+   * Get a client context from a Configuration object.
+   *
+   * This method is less efficient than the version that takes a DFSClient#Conf
+   * object, and should mostly be used by tests.
+   */
+  @VisibleForTesting
+  public static ClientContext getFromConf(Configuration conf) {
+    return get(conf.get(DFSConfigKeys.DFS_CLIENT_CONTEXT,
+        DFSConfigKeys.DFS_CLIENT_CONTEXT_DEFAULT),
+            new DFSClient.Conf(conf));
+  }
+
+  private void printConfWarningIfNeeded(Conf conf) {
+    String existing = this.getConfString();
+    String requested = confAsString(conf);
+    if (!existing.equals(requested)) {
+      if (!printedConfWarning) {
+        printedConfWarning = true;
+        LOG.warn("Existing client context '" + name + "' does not match " +
+            "requested configuration.  Existing: " + existing + 
+            ", Requested: " + requested);
+      }
+    }
+  }
+
+  public String getConfString() {
+    return confString;
+  }
+
+  public ShortCircuitCache getShortCircuitCache() {
+    return shortCircuitCache;
+  }
+
+  public PeerCache getPeerCache() {
+    return peerCache;
+  }
+
+  public boolean getUseLegacyBlockReaderLocal() {
+    return useLegacyBlockReaderLocal;
+  }
+
+  public boolean getDisableLegacyBlockReaderLocal() {
+    return disableLegacyBlockReaderLocal;
+  }
+
+  public void setDisableLegacyBlockReaderLocal() {
+    disableLegacyBlockReaderLocal = true;
+  }
+
+  public DomainSocketFactory getDomainSocketFactory() {
+    return domainSocketFactory;
+  }
+}
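Because contexts are looked up by name in the CACHES map above, every DFSClient configured with the same dfs.client.context value shares one PeerCache, ShortCircuitCache and DomainSocketFactory. A minimal sketch of how a test or other caller might obtain the shared context through the accessors defined in this class (the context name is an arbitrary example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSConfigKeys;

public class ClientContextSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Clients created with the same context name share these caches.
    conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, "example-context");

    ClientContext context = ClientContext.getFromConf(conf);
    System.out.println("peer cache: " + context.getPeerCache());
    System.out.println("short-circuit cache: " + context.getShortCircuitCache());
  }
}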

+ 52 - 74
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java

@@ -56,6 +56,8 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_WRITE_TIMEOUT_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT_DEFAULT;
 
 import java.io.BufferedOutputStream;
 import java.io.DataInputStream;
@@ -108,9 +110,10 @@ import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.fs.VolumeId;
 import org.apache.hadoop.fs.permission.FsPermission;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
 import org.apache.hadoop.hdfs.client.HdfsDataOutputStream;
+import org.apache.hadoop.hdfs.net.Peer;
+import org.apache.hadoop.hdfs.net.TcpPeerServer;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveIterator;
@@ -191,7 +194,7 @@ import com.google.common.net.InetAddresses;
  *
  ********************************************************/
 @InterfaceAudience.Private
-public class DFSClient implements java.io.Closeable {
+public class DFSClient implements java.io.Closeable, RemotePeerFactory {
   public static final Log LOG = LogFactory.getLog(DFSClient.class);
   public static final long SERVER_DEFAULTS_VALIDITY_PERIOD = 60 * 60 * 1000L; // 1 hour
   static final int TCP_WINDOW_SIZE = 128 * 1024; // 128 KB
@@ -212,50 +215,13 @@ public class DFSClient implements java.io.Closeable {
   final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;
   final FileSystem.Statistics stats;
   private final String authority;
-  final PeerCache peerCache;
   private Random r = new Random();
   private SocketAddress[] localInterfaceAddrs;
   private DataEncryptionKey encryptionKey;
-  private boolean shouldUseLegacyBlockReaderLocal;
   private final CachingStrategy defaultReadCachingStrategy;
   private final CachingStrategy defaultWriteCachingStrategy;
-  private ClientMmapManager mmapManager;
+  private final ClientContext clientContext;
   
-  private static final ClientMmapManagerFactory MMAP_MANAGER_FACTORY =
-      new ClientMmapManagerFactory();
-
-  private static final class ClientMmapManagerFactory {
-    private ClientMmapManager mmapManager = null;
-    /**
-     * Tracks the number of users of mmapManager.
-     */
-    private int refcnt = 0;
-
-    synchronized ClientMmapManager get(Configuration conf) {
-      if (refcnt++ == 0) {
-        mmapManager = ClientMmapManager.fromConf(conf);
-      } else {
-        String mismatches = mmapManager.verifyConfigurationMatches(conf);
-        if (!mismatches.isEmpty()) {
-          LOG.warn("The ClientMmapManager settings you specified " +
-            "have been ignored because another thread created the " +
-            "ClientMmapManager first.  " + mismatches);
-        }
-      }
-      return mmapManager;
-    }
-    
-    synchronized void unref(ClientMmapManager mmapManager) {
-      if (this.mmapManager != mmapManager) {
-        throw new IllegalArgumentException();
-      }
-      if (--refcnt == 0) {
-        IOUtils.cleanup(LOG, mmapManager);
-        mmapManager = null;
-      }
-    }
-  }
-
   /**
    * DFSClient configuration 
    */
@@ -300,6 +266,11 @@ public class DFSClient implements java.io.Closeable {
     final boolean domainSocketDataTraffic;
     final int shortCircuitStreamsCacheSize;
     final long shortCircuitStreamsCacheExpiryMs; 
+    
+    final int shortCircuitMmapCacheSize;
+    final long shortCircuitMmapCacheExpiryMs;
+    final long shortCircuitMmapCacheRetryTimeout;
+    final long shortCircuitCacheStaleThresholdMs;
 
     public Conf(Configuration conf) {
       // The hdfsTimeout is currently the same as the ipc timeout 
@@ -416,6 +387,18 @@ public class DFSClient implements java.io.Closeable {
       shortCircuitStreamsCacheExpiryMs = conf.getLong(
           DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
           DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT);
+      shortCircuitMmapCacheSize = conf.getInt(
+          DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
+          DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT);
+      shortCircuitMmapCacheExpiryMs = conf.getLong(
+          DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
+          DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT);
+      shortCircuitMmapCacheRetryTimeout = conf.getLong(
+          DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
+          DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT);
+      shortCircuitCacheStaleThresholdMs = conf.getLong(
+          DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
+          DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT);
     }
 
     private DataChecksum.Type getChecksumType(Configuration conf) {
@@ -464,11 +447,11 @@ public class DFSClient implements java.io.Closeable {
   public Conf getConf() {
     return dfsClientConf;
   }
-  
+
   Configuration getConfiguration() {
     return conf;
   }
-  
+
   /**
    * A map from file names to {@link DFSOutputStream} objects
    * that are currently being written by this client.
@@ -477,8 +460,6 @@ public class DFSClient implements java.io.Closeable {
   private final Map<String, DFSOutputStream> filesBeingWritten
       = new HashMap<String, DFSOutputStream>();
 
-  private final DomainSocketFactory domainSocketFactory;
-  
   /**
    * Same as this(NameNode.getAddress(conf), conf);
    * @see #DFSClient(InetSocketAddress, Configuration)
@@ -526,8 +507,6 @@ public class DFSClient implements java.io.Closeable {
     throws IOException {
     // Copy only the required DFSClient configuration
     this.dfsClientConf = new Conf(conf);
-    this.shouldUseLegacyBlockReaderLocal = 
-        this.dfsClientConf.useLegacyBlockReaderLocal;
     if (this.dfsClientConf.useLegacyBlockReaderLocal) {
       LOG.debug("Using legacy short-circuit local reads.");
     }
@@ -572,9 +551,6 @@ public class DFSClient implements java.io.Closeable {
       this.namenode = proxyInfo.getProxy();
     }
 
-    // read directly from the block file if configured.
-    this.domainSocketFactory = new DomainSocketFactory(dfsClientConf);
-
     String localInterfaces[] =
       conf.getTrimmedStrings(DFSConfigKeys.DFS_CLIENT_LOCAL_INTERFACES);
     localInterfaceAddrs = getLocalInterfaceAddrs(localInterfaces);
@@ -584,7 +560,6 @@ public class DFSClient implements java.io.Closeable {
       Joiner.on(',').join(localInterfaceAddrs) + "]");
     }
     
-    this.peerCache = PeerCache.getInstance(dfsClientConf.socketCacheCapacity, dfsClientConf.socketCacheExpiry);
     Boolean readDropBehind = (conf.get(DFS_CLIENT_CACHE_DROP_BEHIND_READS) == null) ?
         null : conf.getBoolean(DFS_CLIENT_CACHE_DROP_BEHIND_READS, false);
     Long readahead = (conf.get(DFS_CLIENT_CACHE_READAHEAD) == null) ?
@@ -595,7 +570,9 @@ public class DFSClient implements java.io.Closeable {
         new CachingStrategy(readDropBehind, readahead);
     this.defaultWriteCachingStrategy =
         new CachingStrategy(writeDropBehind, readahead);
-    this.mmapManager = MMAP_MANAGER_FACTORY.get(conf);
+    this.clientContext = ClientContext.get(
+        conf.get(DFS_CLIENT_CONTEXT, DFS_CLIENT_CONTEXT_DEFAULT),
+        dfsClientConf);
   }
   
   /**
@@ -800,10 +777,6 @@ public class DFSClient implements java.io.Closeable {
   
   /** Abort and release resources held.  Ignore all errors. */
   void abort() {
-    if (mmapManager != null) {
-      MMAP_MANAGER_FACTORY.unref(mmapManager);
-      mmapManager = null;
-    }
     clientRunning = false;
     closeAllFilesBeingWritten(true);
     try {
@@ -849,10 +822,6 @@ public class DFSClient implements java.io.Closeable {
    */
   @Override
   public synchronized void close() throws IOException {
-    if (mmapManager != null) {
-      MMAP_MANAGER_FACTORY.unref(mmapManager);
-      mmapManager = null;
-    }
     if(clientRunning) {
       closeAllFilesBeingWritten(false);
       clientRunning = false;
@@ -2626,18 +2595,6 @@ public class DFSClient implements java.io.Closeable {
         + ", ugi=" + ugi + "]"; 
   }
 
-  public DomainSocketFactory getDomainSocketFactory() {
-    return domainSocketFactory;
-  }
-
-  public void disableLegacyBlockReaderLocal() {
-    shouldUseLegacyBlockReaderLocal = false;
-  }
-
-  public boolean useLegacyBlockReaderLocal() {
-    return shouldUseLegacyBlockReaderLocal;
-  }
-
   public CachingStrategy getDefaultReadCachingStrategy() {
     return defaultReadCachingStrategy;
   }
@@ -2646,8 +2603,29 @@ public class DFSClient implements java.io.Closeable {
     return defaultWriteCachingStrategy;
   }
 
-  @VisibleForTesting
-  public ClientMmapManager getMmapManager() {
-    return mmapManager;
+  public ClientContext getClientContext() {
+    return clientContext;
+  }
+
+  @Override // RemotePeerFactory
+  public Peer newConnectedPeer(InetSocketAddress addr) throws IOException {
+    Peer peer = null;
+    boolean success = false;
+    Socket sock = null;
+    try {
+      sock = socketFactory.createSocket();
+      NetUtils.connect(sock, addr,
+        getRandomLocalInterfaceAddr(),
+        dfsClientConf.socketTimeout);
+      peer = TcpPeerServer.peerFromSocketAndKey(sock, 
+          getDataEncryptionKey());
+      success = true;
+      return peer;
+    } finally {
+      if (!success) {
+        IOUtils.cleanup(LOG, peer);
+        IOUtils.closeSocket(sock);
+      }
+    }
   }
 }

+ 10 - 6
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -59,6 +59,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final String  DFS_CLIENT_CACHE_DROP_BEHIND_WRITES = "dfs.client.cache.drop.behind.writes";
   public static final String  DFS_CLIENT_CACHE_DROP_BEHIND_READS = "dfs.client.cache.drop.behind.reads";
   public static final String  DFS_CLIENT_CACHE_READAHEAD = "dfs.client.cache.readahead";
+  public static final String  DFS_CLIENT_CONTEXT = "dfs.client.context";
+  public static final String  DFS_CLIENT_CONTEXT_DEFAULT = "default";
   public static final String  DFS_HDFS_BLOCKS_METADATA_ENABLED = "dfs.datanode.hdfs-blocks-metadata.enabled";
   public static final boolean DFS_HDFS_BLOCKS_METADATA_ENABLED_DEFAULT = false;
   public static final String  DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_NUM_THREADS = "dfs.client.file-block-storage-locations.num-threads";
@@ -418,18 +420,20 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final boolean DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT = false;
   public static final String DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY = "dfs.client.read.shortcircuit.buffer.size";
   public static final String DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY = "dfs.client.read.shortcircuit.streams.cache.size";
-  public static final int DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT = 100;
+  public static final int DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT = 256;
   public static final String DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY = "dfs.client.read.shortcircuit.streams.cache.expiry.ms";
-  public static final long DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT = 5000;
+  public static final long DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT = 5 * 60 * 1000;
   public static final int DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT = 1024 * 1024;
   public static final String DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC = "dfs.client.domain.socket.data.traffic";
   public static final boolean DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT = false;
   public static final String DFS_CLIENT_MMAP_CACHE_SIZE = "dfs.client.mmap.cache.size";
-  public static final int DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT = 1024;
+  public static final int DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT = 256;
   public static final String DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS = "dfs.client.mmap.cache.timeout.ms";
-  public static final long DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT  = 15 * 60 * 1000;
-  public static final String DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT = "dfs.client.mmap.cache.thread.runs.per.timeout";
-  public static final int DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT  = 4;
+  public static final long DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT  = 60 * 60 * 1000;
+  public static final String DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS = "dfs.client.mmap.retry.timeout.ms";
+  public static final long DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT = 5 * 60 * 1000;
+  public static final String DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS = "dfs.client.short.circuit.replica.stale.threshold.ms";
+  public static final long DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT = 30 * 60 * 1000;
 
   // property for fsimage compression
   public static final String DFS_IMAGE_COMPRESS_KEY = "dfs.image.compress";
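The new keys and retuned defaults above (a 256-entry stream cache, a five-minute stream expiry, an hour-long mmap cache timeout, plus the mmap retry and replica stale thresholds) can all be overridden per client. A hedged sketch with purely illustrative values:

import org.apache.hadoop.conf.Configuration;

public class ShortCircuitCacheTuningSketch {
  public static Configuration tuned() {
    Configuration conf = new Configuration();
    // Cache up to 512 short-circuit descriptor pairs (new default: 256).
    conf.setInt("dfs.client.read.shortcircuit.streams.cache.size", 512);
    // Expire unused descriptors after 10 minutes (new default: 5 minutes).
    conf.setLong("dfs.client.read.shortcircuit.streams.cache.expiry.ms",
        10 * 60 * 1000);
    // Wait 2 minutes before retrying a failed mmap (default: 5 minutes).
    conf.setLong("dfs.client.mmap.retry.timeout.ms", 2 * 60 * 1000);
    // Treat a cached replica as stale after 15 minutes (default: 30 minutes).
    conf.setLong("dfs.client.short.circuit.replica.stale.threshold.ms",
        15 * 60 * 1000);
    return conf;
  }
}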

+ 56 - 227
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java

@@ -46,9 +46,6 @@ import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
 import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.hdfs.client.ClientMmap;
-import org.apache.hadoop.hdfs.net.DomainPeer;
-import org.apache.hadoop.hdfs.net.Peer;
-import org.apache.hadoop.hdfs.net.TcpPeerServer;
 import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -82,7 +79,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
     HasEnhancedByteBufferAccess {
   @VisibleForTesting
   static boolean tcpReadsDisabledForTesting = false;
-  private final PeerCache peerCache;
   private final DFSClient dfsClient;
   private boolean closed = false;
   private final String src;
@@ -190,8 +186,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
     private long totalZeroCopyBytesRead;
   }
   
-  private final FileInputStreamCache fileInputStreamCache;
-
   /**
    * This variable tracks the number of failures since the start of the
    * most recent user-facing operation. That is to say, it should be reset
@@ -223,10 +217,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
     this.verifyChecksum = verifyChecksum;
     this.buffersize = buffersize;
     this.src = src;
-    this.peerCache = dfsClient.peerCache;
-    this.fileInputStreamCache = new FileInputStreamCache(
-        dfsClient.getConf().shortCircuitStreamsCacheSize,
-        dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
     this.cachingStrategy =
         dfsClient.getDefaultReadCachingStrategy();
     openInfo();
@@ -572,18 +562,28 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
       try {
         ExtendedBlock blk = targetBlock.getBlock();
         Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
-        blockReader = getBlockReader(targetAddr, chosenNode, src, blk,
-            accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock,
-            buffersize, verifyChecksum, dfsClient.clientName, cachingStrategy);
+        blockReader = new BlockReaderFactory(dfsClient.getConf()).
+            setInetSocketAddress(targetAddr).
+            setRemotePeerFactory(dfsClient).
+            setDatanodeInfo(chosenNode).
+            setFileName(src).
+            setBlock(blk).
+            setBlockToken(accessToken).
+            setStartOffset(offsetIntoBlock).
+            setVerifyChecksum(verifyChecksum).
+            setClientName(dfsClient.clientName).
+            setLength(blk.getNumBytes() - offsetIntoBlock).
+            setCachingStrategy(cachingStrategy).
+            setAllowShortCircuitLocalReads(!shortCircuitForbidden()).
+            setClientCacheContext(dfsClient.getClientContext()).
+            setUserGroupInformation(dfsClient.ugi).
+            setConfiguration(dfsClient.getConfiguration()).
+            build();
         if(connectFailedOnce) {
           DFSClient.LOG.info("Successfully connected to " + targetAddr +
                              " for " + blk);
         }
         return chosenNode;
-      } catch (AccessControlException ex) {
-        DFSClient.LOG.warn("Short circuit access failed " + ex);
-        dfsClient.disableLegacyBlockReaderLocal();
-        continue;
       } catch (IOException ex) {
         if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
           DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
@@ -635,7 +635,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
       blockReader = null;
     }
     super.close();
-    fileInputStreamCache.close();
     closed = true;
   }
 
@@ -932,9 +931,11 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
       // or fetchBlockAt(). Always get the latest list of locations at the 
       // start of the loop.
       CachingStrategy curCachingStrategy;
+      boolean allowShortCircuitLocalReads;
       synchronized (this) {
         block = getBlockAt(block.getStartOffset(), false);
         curCachingStrategy = cachingStrategy;
+        allowShortCircuitLocalReads = !shortCircuitForbidden();
       }
       DNAddrPair retval = chooseDataNode(block);
       DatanodeInfo chosenNode = retval.info;
@@ -943,11 +944,24 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
           
       try {
         Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
-            
         int len = (int) (end - start + 1);
-        reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(),
-            blockToken, start, len, buffersize, verifyChecksum,
-            dfsClient.clientName, curCachingStrategy);
+        reader = new BlockReaderFactory(dfsClient.getConf()).
+            setInetSocketAddress(targetAddr).
+            setRemotePeerFactory(dfsClient).
+            setDatanodeInfo(chosenNode).
+            setFileName(src).
+            setBlock(block.getBlock()).
+            setBlockToken(blockToken).
+            setStartOffset(start).
+            setVerifyChecksum(verifyChecksum).
+            setClientName(dfsClient.clientName).
+            setLength(len).
+            setCachingStrategy(curCachingStrategy).
+            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
+            setClientCacheContext(dfsClient.getClientContext()).
+            setUserGroupInformation(dfsClient.ugi).
+            setConfiguration(dfsClient.getConfiguration()).
+            build();
         int nread = reader.readAll(buf, offset, len);
         if (nread != len) {
           throw new IOException("truncated return from reader.read(): " +
@@ -960,10 +974,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
                  e.getPos() + " from " + chosenNode);
         // we want to remember what we have tried
         addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
-      } catch (AccessControlException ex) {
-        DFSClient.LOG.warn("Short circuit access failed " + ex);
-        dfsClient.disableLegacyBlockReaderLocal();
-        continue;
       } catch (IOException e) {
         if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
           DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
@@ -1022,194 +1032,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
     return false;
   }
 
-  private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
-    Peer peer = null;
-    boolean success = false;
-    Socket sock = null;
-    try {
-      sock = dfsClient.socketFactory.createSocket();
-      NetUtils.connect(sock, addr,
-        dfsClient.getRandomLocalInterfaceAddr(),
-        dfsClient.getConf().socketTimeout);
-      peer = TcpPeerServer.peerFromSocketAndKey(sock, 
-          dfsClient.getDataEncryptionKey());
-      success = true;
-      return peer;
-    } finally {
-      if (!success) {
-        IOUtils.closeQuietly(peer);
-        IOUtils.closeQuietly(sock);
-      }
-    }
-  }
-
-  /**
-   * Retrieve a BlockReader suitable for reading.
-   * This method will reuse the cached connection to the DN if appropriate.
-   * Otherwise, it will create a new connection.
-   * Throwing an IOException from this method is basically equivalent to 
-   * declaring the DataNode bad, so we try to connect a lot of different ways
-   * before doing that.
-   *
-   * @param dnAddr  Address of the datanode
-   * @param chosenNode Chosen datanode information
-   * @param file  File location
-   * @param block  The Block object
-   * @param blockToken  The access token for security
-   * @param startOffset  The read offset, relative to block head
-   * @param len  The number of bytes to read
-   * @param bufferSize  The IO buffer size (not the client buffer size)
-   * @param verifyChecksum  Whether to verify checksum
-   * @param clientName  Client name
-   * @param CachingStrategy  caching strategy to use
-   * @return New BlockReader instance
-   */
-  protected BlockReader getBlockReader(InetSocketAddress dnAddr,
-                                       DatanodeInfo chosenNode,
-                                       String file,
-                                       ExtendedBlock block,
-                                       Token<BlockTokenIdentifier> blockToken,
-                                       long startOffset,
-                                       long len,
-                                       int bufferSize,
-                                       boolean verifyChecksum,
-                                       String clientName,
-                                       CachingStrategy curCachingStrategy)
-      throws IOException {
-    // Firstly, we check to see if we have cached any file descriptors for
-    // local blocks.  If so, we can just re-use those file descriptors.
-    FileInputStream fis[] = fileInputStreamCache.get(chosenNode, block);
-    if (fis != null) {
-      if (DFSClient.LOG.isDebugEnabled()) {
-        DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
-            "the FileInputStreamCache.");
-      }
-      return new BlockReaderLocal.Builder(dfsClient.getConf()).
-          setFilename(file).
-          setBlock(block).
-          setStartOffset(startOffset).
-          setStreams(fis).
-          setDatanodeID(chosenNode).
-          setVerifyChecksum(verifyChecksum).
-          setBlockMetadataHeader(BlockMetadataHeader.
-              preadHeader(fis[1].getChannel())).
-          setFileInputStreamCache(fileInputStreamCache).
-          setCachingStrategy(curCachingStrategy).
-          build();
-    }
-    
-    // If the legacy local block reader is enabled and we are reading a local
-    // block, try to create a BlockReaderLocalLegacy.  The legacy local block
-    // reader implements local reads in the style first introduced by HDFS-2246.
-    if ((dfsClient.useLegacyBlockReaderLocal()) &&
-        DFSClient.isLocalAddress(dnAddr) &&
-        (!shortCircuitForbidden())) {
-      try {
-        return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient,
-            clientName, block, blockToken, chosenNode, startOffset);
-      } catch (IOException e) {
-        DFSClient.LOG.warn("error creating legacy BlockReaderLocal.  " +
-            "Disabling legacy local reads.", e);
-        dfsClient.disableLegacyBlockReaderLocal();
-      }
-    }
-
-    // Look for cached domain peers.
-    int cacheTries = 0;
-    DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory();
-    BlockReader reader = null;
-    final int nCachedConnRetry = dfsClient.getConf().nCachedConnRetry;
-    for (; cacheTries < nCachedConnRetry; ++cacheTries) {
-      Peer peer = peerCache.get(chosenNode, true);
-      if (peer == null) break;
-      try {
-        boolean allowShortCircuitLocalReads = dfsClient.getConf().
-            shortCircuitLocalReads && (!shortCircuitForbidden());
-        reader = BlockReaderFactory.newBlockReader(
-            dfsClient.getConf(), file, block, blockToken, startOffset,
-            len, verifyChecksum, clientName, peer, chosenNode, 
-            dsFactory, peerCache, fileInputStreamCache,
-            allowShortCircuitLocalReads, curCachingStrategy);
-        return reader;
-      } catch (IOException ex) {
-        DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
-            "Closing stale " + peer, ex);
-      } finally {
-        if (reader == null) {
-          IOUtils.closeQuietly(peer);
-        }
-      }
-    }
-
-    // Try to create a DomainPeer.
-    DomainSocket domSock = dsFactory.create(dnAddr, this);
-    if (domSock != null) {
-      Peer peer = new DomainPeer(domSock);
-      try {
-        boolean allowShortCircuitLocalReads = dfsClient.getConf().
-            shortCircuitLocalReads && (!shortCircuitForbidden());
-        reader = BlockReaderFactory.newBlockReader(
-            dfsClient.getConf(), file, block, blockToken, startOffset,
-            len, verifyChecksum, clientName, peer, chosenNode,
-            dsFactory, peerCache, fileInputStreamCache,
-            allowShortCircuitLocalReads, curCachingStrategy);
-        return reader;
-      } catch (IOException e) {
-        DFSClient.LOG.warn("failed to connect to " + domSock, e);
-      } finally {
-        if (reader == null) {
-         // If the Peer that we got the error from was a DomainPeer,
-         // mark the socket path as bad, so that newDataSocket will not try 
-         // to re-open this socket for a while.
-         dsFactory.disableDomainSocketPath(domSock.getPath());
-         IOUtils.closeQuietly(peer);
-        }
-      }
-    }
-
-    // Look for cached peers.
-    for (; cacheTries < nCachedConnRetry; ++cacheTries) {
-      Peer peer = peerCache.get(chosenNode, false);
-      if (peer == null) break;
-      try {
-        reader = BlockReaderFactory.newBlockReader(
-            dfsClient.getConf(), file, block, blockToken, startOffset,
-            len, verifyChecksum, clientName, peer, chosenNode, 
-            dsFactory, peerCache, fileInputStreamCache, false,
-            curCachingStrategy);
-        return reader;
-      } catch (IOException ex) {
-        DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
-          peer, ex);
-      } finally {
-        if (reader == null) {
-          IOUtils.closeQuietly(peer);
-        }
-      }
-    }
-    if (tcpReadsDisabledForTesting) {
-      throw new IOException("TCP reads are disabled.");
-    }
-    // Try to create a new remote peer.
-    Peer peer = newTcpPeer(dnAddr);
-    try {
-      reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file,
-          block, blockToken, startOffset, len, verifyChecksum, clientName,
-          peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false,
-          curCachingStrategy);
-      return reader;
-    } catch (IOException ex) {
-      DFSClient.LOG.debug(
-          "Exception while getting block reader, closing stale " + peer, ex);
-      throw ex;
-    } finally {
-      if (reader == null) {
-        IOUtils.closeQuietly(peer);
-      }
-    }
-  }
-
-
   /**
    * Read bytes starting from the specified position.
    * 
@@ -1555,8 +1377,7 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
     long blockStartInFile = currentLocatedBlock.getStartOffset();
     long blockPos = curPos - blockStartInFile;
     long limit = blockPos + length;
-    ClientMmap clientMmap =
-        blockReader.getClientMmap(opts, dfsClient.getMmapManager());
+    ClientMmap clientMmap = blockReader.getClientMmap(opts);
     if (clientMmap == null) {
       if (DFSClient.LOG.isDebugEnabled()) {
         DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
@@ -1565,17 +1386,25 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
       }
       return null;
     }
-    seek(pos + length);
-    ByteBuffer buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
-    buffer.position((int)blockPos);
-    buffer.limit((int)limit);
-    clientMmap.ref();
-    extendedReadBuffers.put(buffer, clientMmap);
-    readStatistics.addZeroCopyBytes(length);
-    if (DFSClient.LOG.isDebugEnabled()) {
-      DFSClient.LOG.debug("readZeroCopy read " + maxLength + " bytes from " +
-          "offset " + curPos + " via the zero-copy read path.  " +
-          "blockEnd = " + blockEnd);
+    boolean success = false;
+    ByteBuffer buffer;
+    try {
+      seek(pos + length);
+      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
+      buffer.position((int)blockPos);
+      buffer.limit((int)limit);
+      extendedReadBuffers.put(buffer, clientMmap);
+      readStatistics.addZeroCopyBytes(length);
+      if (DFSClient.LOG.isDebugEnabled()) {
+        DFSClient.LOG.debug("readZeroCopy read " + maxLength + " bytes from " +
+            "offset " + curPos + " via the zero-copy read path.  " +
+            "blockEnd = " + blockEnd);
+      }
+      success = true;
+    } finally {
+      if (!success) {
+        clientMmap.unref();
+      }
     }
     return buffer;
   }
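
For orientation, here is a hedged usage sketch showing how an application reaches the readZeroCopy() path above through the enhanced byte-buffer read API. The file path, buffer size, and class name are illustrative assumptions, not part of this change.

import java.nio.ByteBuffer;
import java.util.EnumSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.io.ElasticByteBufferPool;

public class ZeroCopyReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    ElasticByteBufferPool pool = new ElasticByteBufferPool();
    FSDataInputStream in = fs.open(new Path("/tmp/example"));
    try {
      // Returns an mmap-backed slice when a short-circuit replica is available,
      // otherwise a buffer copied from the pool; null signals end of stream.
      ByteBuffer buf = in.read(pool, 4 * 1024 * 1024,
          EnumSet.of(ReadOption.SKIP_CHECKSUMS));
      if (buf != null) {
        // ... consume buf ...
        // Releasing the buffer is what eventually drops the ClientMmap
        // reference that readZeroCopy() stored in extendedReadBuffers.
        in.releaseBuffer(buf);
      }
    } finally {
      in.close();
    }
  }
}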

+ 76 - 39
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DomainSocketFactory.java

@@ -27,29 +27,71 @@ import org.apache.hadoop.HadoopIllegalArgumentException;
 import org.apache.hadoop.hdfs.DFSClient.Conf;
 import org.apache.hadoop.net.unix.DomainSocket;
 
+import com.google.common.base.Preconditions;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 
 class DomainSocketFactory {
   private static final Log LOG = BlockReaderLocal.LOG;
-  private final Conf conf;
 
-  enum PathStatus {
-    UNUSABLE,
-    SHORT_CIRCUIT_DISABLED,
+  public enum PathState {
+    UNUSABLE(false, false),
+    SHORT_CIRCUIT_DISABLED(true, false),
+    VALID(true, true);
+
+    PathState(boolean usableForDataTransfer, boolean usableForShortCircuit) {
+      this.usableForDataTransfer = usableForDataTransfer;
+      this.usableForShortCircuit = usableForShortCircuit;
+    }
+
+    public boolean getUsableForDataTransfer() {
+      return usableForDataTransfer;
+    }
+
+    public boolean getUsableForShortCircuit() {
+      return usableForShortCircuit;
+    }
+
+    private final boolean usableForDataTransfer;
+    private final boolean usableForShortCircuit;
+  }
+
+  public static class PathInfo {
+    private final static PathInfo NOT_CONFIGURED =
+          new PathInfo("", PathState.UNUSABLE);
+
+    final private String path;
+    final private PathState state;
+
+    PathInfo(String path, PathState state) {
+      this.path = path;
+      this.state = state;
+    }
+
+    public String getPath() {
+      return path;
+    }
+
+    public PathState getPathState() {
+      return state;
+    }
+    
+    @Override
+    public String toString() {
+      return new StringBuilder().append("PathInfo{path=").append(path).
+          append(", state=").append(state).append("}").toString();
+    }
   }
 
   /**
    * Information about domain socket paths.
    */
-  Cache<String, PathStatus> pathInfo =
+  Cache<String, PathState> pathMap =
       CacheBuilder.newBuilder()
       .expireAfterWrite(10, TimeUnit.MINUTES)
       .build();
 
   public DomainSocketFactory(Conf conf) {
-    this.conf = conf;
-
     final String feature;
     if (conf.shortCircuitLocalReads && (!conf.useLegacyBlockReaderLocal)) {
       feature = "The short-circuit local reads feature";
@@ -75,51 +117,46 @@ class DomainSocketFactory {
   }
 
   /**
-   * Create a DomainSocket.
-   * 
-   * @param addr        The address of the DataNode
-   * @param stream      The DFSInputStream the socket will be created for.
+   * Get information about a domain socket path.
+   *
+   * @param addr         The inet address to use.
+   * @param conf         The client configuration.
    *
-   * @return            null if the socket could not be created; the
-   *                    socket otherwise.  If there was an error while
-   *                    creating the socket, we will add the socket path
-   *                    to our list of failed domain socket paths.
+   * @return             Information about the socket path.
    */
-  DomainSocket create(InetSocketAddress addr, DFSInputStream stream) {
+  public PathInfo getPathInfo(InetSocketAddress addr, DFSClient.Conf conf) {
     // If there is no domain socket path configured, we can't use domain
     // sockets.
-    if (conf.domainSocketPath.isEmpty()) return null;
+    if (conf.domainSocketPath.isEmpty()) return PathInfo.NOT_CONFIGURED;
     // If we can't do anything with the domain socket, don't create it.
     if (!conf.domainSocketDataTraffic &&
         (!conf.shortCircuitLocalReads || conf.useLegacyBlockReaderLocal)) {
-      return null;
+      return PathInfo.NOT_CONFIGURED;
     }
-    // UNIX domain sockets can only be used to talk to local peers
-    if (!DFSClient.isLocalAddress(addr)) return null;
     // If the DomainSocket code is not loaded, we can't create
     // DomainSocket objects.
-    if (DomainSocket.getLoadingFailureReason() != null) return null;
+    if (DomainSocket.getLoadingFailureReason() != null) {
+      return PathInfo.NOT_CONFIGURED;
+    }
+    // UNIX domain sockets can only be used to talk to local peers
+    if (!DFSClient.isLocalAddress(addr)) return PathInfo.NOT_CONFIGURED;
     String escapedPath = DomainSocket.
         getEffectivePath(conf.domainSocketPath, addr.getPort());
-    PathStatus info = pathInfo.getIfPresent(escapedPath);
-    if (info == PathStatus.UNUSABLE) {
-      // We tried to connect to this domain socket before, and it was totally
-      // unusable.
-      return null;
-    }
-    if ((!conf.domainSocketDataTraffic) &&
-        ((info == PathStatus.SHORT_CIRCUIT_DISABLED) || 
-            stream.shortCircuitForbidden())) {
-      // If we don't want to pass data over domain sockets, and we don't want
-      // to pass file descriptors over them either, we have no use for domain
-      // sockets.
-      return null;
+    PathState status = pathMap.getIfPresent(escapedPath);
+    if (status == null) {
+      return new PathInfo(escapedPath, PathState.VALID);
+    } else {
+      return new PathInfo(escapedPath, status);
     }
+  }
+
+  public DomainSocket createSocket(PathInfo info, int socketTimeout) {
+    Preconditions.checkArgument(info.getPathState() != PathState.UNUSABLE);
     boolean success = false;
     DomainSocket sock = null;
     try {
-      sock = DomainSocket.connect(escapedPath);
-      sock.setAttribute(DomainSocket.RECEIVE_TIMEOUT, conf.socketTimeout);
+      sock = DomainSocket.connect(info.getPath());
+      sock.setAttribute(DomainSocket.RECEIVE_TIMEOUT, socketTimeout);
       success = true;
     } catch (IOException e) {
       LOG.warn("error creating DomainSocket", e);
@@ -129,7 +166,7 @@ class DomainSocketFactory {
         if (sock != null) {
           IOUtils.closeQuietly(sock);
         }
-        pathInfo.put(escapedPath, PathStatus.UNUSABLE);
+        pathMap.put(info.getPath(), PathState.UNUSABLE);
         sock = null;
       }
     }
@@ -137,10 +174,10 @@ class DomainSocketFactory {
   }
 
   public void disableShortCircuitForPath(String path) {
-    pathInfo.put(path, PathStatus.SHORT_CIRCUIT_DISABLED);
+    pathMap.put(path, PathState.SHORT_CIRCUIT_DISABLED);
   }
 
   public void disableDomainSocketPath(String path) {
-    pathInfo.put(path, PathStatus.UNUSABLE);
+    pathMap.put(path, PathState.UNUSABLE);
   }
 }
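
A minimal caller-side sketch of the new two-step contract (query the path state first, then connect). It assumes a DFSClient.Conf instance named conf and a DataNode address named addr; it is not code from this patch.

DomainSocketFactory factory = new DomainSocketFactory(conf);
DomainSocketFactory.PathInfo info = factory.getPathInfo(addr, conf);
DomainSocket sock = null;
if (info.getPathState().getUsableForDataTransfer()) {
  // A null return means the connect failed; createSocket() has already marked
  // the path UNUSABLE in pathMap, so it is skipped until the cache entry
  // expires (10 minutes).
  sock = factory.createSocket(info, conf.socketTimeout);
}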

+ 75 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/ExtendedBlockId.java

@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import org.apache.commons.lang.builder.EqualsBuilder;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
+/**
+ * An immutable key which identifies a block.
+ */
+final public class ExtendedBlockId {
+  /**
+   * The block ID for this block.
+   */
+  private final long blockId;
+
+  /**
+   * The block pool ID for this block.
+   */
+  private final String bpId;
+
+  public ExtendedBlockId(long blockId, String bpId) {
+    this.blockId = blockId;
+    this.bpId = bpId;
+  }
+
+  public long getBlockId() {
+    return this.blockId;
+  }
+
+  public String getBlockPoolId() {
+    return this.bpId;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if ((o == null) || (o.getClass() != this.getClass())) {
+      return false;
+    }
+    ExtendedBlockId other = (ExtendedBlockId)o;
+    return new EqualsBuilder().
+        append(blockId, other.blockId).
+        append(bpId, other.bpId).
+        isEquals();
+  }
+
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder().
+        append(this.blockId).
+        append(this.bpId).
+        toHashCode();
+  }
+
+  @Override
+  public String toString() {
+    return new StringBuilder().append(blockId).
+        append("_").append(bpId).toString();
+  }
+}
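
Because equals() and hashCode() cover both the block ID and the block pool ID, instances can be used directly as map keys. A small self-contained sketch (the IDs below are invented):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hdfs.ExtendedBlockId;

public class ExtendedBlockIdKeySketch {
  public static void main(String[] args) {
    Map<ExtendedBlockId, String> replicas =
        new HashMap<ExtendedBlockId, String>();
    ExtendedBlockId a = new ExtendedBlockId(1073741825L, "BP-1-127.0.0.1-1");
    ExtendedBlockId b = new ExtendedBlockId(1073741825L, "BP-1-127.0.0.1-1");
    replicas.put(a, "first");
    replicas.put(b, "second");           // replaces "first": a.equals(b) is true
    System.out.println(replicas.size()); // prints 1
    System.out.println(a);               // prints 1073741825_BP-1-127.0.0.1-1
  }
}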

+ 0 - 287
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/FileInputStreamCache.java

@@ -1,287 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hdfs;
-
-import java.io.Closeable;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.lang.ref.WeakReference;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-import java.util.concurrent.ScheduledFuture;
-import java.util.concurrent.ScheduledThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hdfs.protocol.DatanodeID;
-import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.util.Time;
-
-import com.google.common.collect.LinkedListMultimap;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
-
-/**
- * FileInputStream cache is used to cache FileInputStream objects that we
- * have received from the DataNode.
- */
-class FileInputStreamCache {
-  private final static Log LOG = LogFactory.getLog(FileInputStreamCache.class);
-
-  /**
-   * The executor service that runs the cacheCleaner.  There is only one of
-   * these per VM.
-   */
-  private final static ScheduledThreadPoolExecutor executor
-      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
-          setDaemon(true).setNameFormat("FileInputStreamCache Cleaner").
-          build());
-  
-  /**
-   * The CacheCleaner for this FileInputStreamCache.  We don't create this
-   * and schedule it until it becomes necessary.
-   */
-  private CacheCleaner cacheCleaner;
-  
-  /**
-   * Maximum number of entries to allow in the cache.
-   */
-  private final int maxCacheSize;
-  
-  /**
-   * The minimum time in milliseconds to preserve an element in the cache.
-   */
-  private final long expiryTimeMs;
-  
-  /**
-   * True if the FileInputStreamCache is closed.
-   */
-  private boolean closed = false;
-  
-  /**
-   * Cache entries.
-   */
-  private final LinkedListMultimap<Key, Value> map = LinkedListMultimap.create();
-
-  /**
-   * Expiry thread which makes sure that the file descriptors get closed
-   * after a while.
-   */
-  private static class CacheCleaner implements Runnable, Closeable {
-    private WeakReference<FileInputStreamCache> cacheRef;
-    private ScheduledFuture<?> future;
-    
-    CacheCleaner(FileInputStreamCache cache) {
-      this.cacheRef = new WeakReference<FileInputStreamCache>(cache);
-    }
-    
-    @Override
-    public void run() {
-      FileInputStreamCache cache = cacheRef.get();
-      if (cache == null) return;
-      synchronized(cache) {
-        if (cache.closed) return;
-        long curTime = Time.monotonicNow();
-        for (Iterator<Entry<Key, Value>> iter =
-                  cache.map.entries().iterator(); iter.hasNext();
-              iter = cache.map.entries().iterator()) {
-          Entry<Key, Value> entry = iter.next();
-          if (entry.getValue().getTime() + cache.expiryTimeMs >= curTime) {
-            break;
-          }
-          entry.getValue().close();
-          iter.remove();
-        }
-      }
-    }
-
-    @Override
-    public void close() throws IOException {
-      if (future != null) {
-        future.cancel(false);
-      }
-    }
-    
-    public void setFuture(ScheduledFuture<?> future) {
-      this.future = future;
-    }
-  }
-
-  /**
-   * The key identifying a FileInputStream array.
-   */
-  static class Key {
-    private final DatanodeID datanodeID;
-    private final ExtendedBlock block;
-    
-    public Key(DatanodeID datanodeID, ExtendedBlock block) {
-      this.datanodeID = datanodeID;
-      this.block = block;
-    }
-    
-    @Override
-    public boolean equals(Object other) {
-      if (!(other instanceof FileInputStreamCache.Key)) {
-        return false;
-      }
-      FileInputStreamCache.Key otherKey = (FileInputStreamCache.Key)other;
-      return (block.equals(otherKey.block) &&
-          (block.getGenerationStamp() == otherKey.block.getGenerationStamp()) &&
-          datanodeID.equals(otherKey.datanodeID));
-    }
-
-    @Override
-    public int hashCode() {
-      return block.hashCode();
-    }
-  }
-
-  /**
-   * The value containing a FileInputStream array and the time it was added to
-   * the cache.
-   */
-  static class Value {
-    private final FileInputStream fis[];
-    private final long time;
-    
-    public Value (FileInputStream fis[]) {
-      this.fis = fis;
-      this.time = Time.monotonicNow();
-    }
-
-    public FileInputStream[] getFileInputStreams() {
-      return fis;
-    }
-
-    public long getTime() {
-      return time;
-    }
-    
-    public void close() {
-      IOUtils.cleanup(LOG, fis);
-    }
-  }
-  
-  /**
-   * Create a new FileInputStream
-   *
-   * @param maxCacheSize         The maximum number of elements to allow in 
-   *                             the cache.
-   * @param expiryTimeMs         The minimum time in milliseconds to preserve
-   *                             elements in the cache.
-   */
-  public FileInputStreamCache(int maxCacheSize, long expiryTimeMs) {
-    this.maxCacheSize = maxCacheSize;
-    this.expiryTimeMs = expiryTimeMs;
-  }
-  
-  /**
-   * Put an array of FileInputStream objects into the cache.
-   *
-   * @param datanodeID          The DatanodeID to store the streams under.
-   * @param block               The Block to store the streams under.
-   * @param fis                 The streams.
-   */
-  public void put(DatanodeID datanodeID, ExtendedBlock block,
-      FileInputStream fis[]) {
-    boolean inserted = false;
-    try {
-      synchronized(this) {
-        if (closed) return;
-        if (map.size() + 1 > maxCacheSize) {
-          Iterator<Entry<Key, Value>> iter = map.entries().iterator();
-          if (!iter.hasNext()) return;
-          Entry<Key, Value> entry = iter.next();
-          entry.getValue().close();
-          iter.remove();
-        }
-        if (cacheCleaner == null) {
-          cacheCleaner = new CacheCleaner(this);
-          ScheduledFuture<?> future = 
-              executor.scheduleAtFixedRate(cacheCleaner, expiryTimeMs, expiryTimeMs,
-                  TimeUnit.MILLISECONDS);
-          cacheCleaner.setFuture(future);
-        }
-        map.put(new Key(datanodeID, block), new Value(fis));
-        inserted = true;
-      }
-    } finally {
-      if (!inserted) {
-        IOUtils.cleanup(LOG, fis);
-      }
-    }
-  }
-  
-  /**
-   * Find and remove an array of FileInputStream objects from the cache.
-   *
-   * @param datanodeID          The DatanodeID to search for.
-   * @param block               The Block to search for.
-   *
-   * @return                    null if no streams can be found; the
-   *                            array otherwise.  If this is non-null, the
-   *                            array will have been removed from the cache.
-   */
-  public synchronized FileInputStream[] get(DatanodeID datanodeID,
-      ExtendedBlock block) {
-    Key key = new Key(datanodeID, block);
-    List<Value> ret = map.get(key);
-    if (ret.isEmpty()) return null;
-    Value val = ret.get(0);
-    map.remove(key, val);
-    return val.getFileInputStreams();
-  }
-  
-  /**
-   * Close the cache and free all associated resources.
-   */
-  public synchronized void close() {
-    if (closed) return;
-    closed = true;
-    IOUtils.cleanup(LOG, cacheCleaner);
-    for (Iterator<Entry<Key, Value>> iter = map.entries().iterator();
-          iter.hasNext();) {
-      Entry<Key, Value> entry = iter.next();
-      entry.getValue().close();
-      iter.remove();
-    }
-  }
-  
-  public synchronized String toString() {
-    StringBuilder bld = new StringBuilder();
-    bld.append("FileInputStreamCache(");
-    String prefix = "";
-    for (Entry<Key, Value> entry : map.entries()) {
-      bld.append(prefix);
-      bld.append(entry.getKey());
-      prefix = ", ";
-    }
-    bld.append(")");
-    return bld.toString();
-  }
-  
-  public long getExpiryTimeMs() {
-    return expiryTimeMs;
-  }
-  
-  public int getMaxCacheSize() {
-    return maxCacheSize;
-  }
-}

+ 3 - 26
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/PeerCache.java

@@ -89,42 +89,19 @@ class PeerCache {
     LinkedListMultimap.create();
   private final int capacity;
   private final long expiryPeriod;
-  private static PeerCache instance = null;
   
-  @VisibleForTesting
-  PeerCache(int c, long e) {
+  public PeerCache(int c, long e) {
     this.capacity = c;
     this.expiryPeriod = e;
 
     if (capacity == 0 ) {
       LOG.info("SocketCache disabled.");
-    }
-    else if (expiryPeriod == 0) {
+    } else if (expiryPeriod == 0) {
       throw new IllegalStateException("Cannot initialize expiryPeriod to " +
-         expiryPeriod + "when cache is enabled.");
+         expiryPeriod + " when cache is enabled.");
     }
   }
  
-  public static synchronized PeerCache getInstance(int c, long e) {
-    // capacity is only initialized once
-    if (instance == null) {
-      instance = new PeerCache(c, e);
-    } else { //already initialized once
-      if (instance.capacity != c || instance.expiryPeriod != e) {
-        LOG.info("capacity and expiry periods already set to " +
-          instance.capacity + " and " + instance.expiryPeriod +
-          " respectively. Cannot set it to " + c + " and " + e);
-      }
-    }
-
-    return instance;
-  }
-
-  @VisibleForTesting
-  public static synchronized void setInstance(int c, long e) {
-    instance = new PeerCache(c, e);
-  }
-
   private boolean isDaemonStarted() {
     return (daemon == null)? false: true;
   }
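
With the process-wide singleton gone, each client context constructs its own cache. A short sketch with illustrative values (capacity 0 disables caching, and a non-zero capacity with a zero expiry period is rejected by the constructor above); the variable names here are assumptions.

PeerCache peerCache = new PeerCache(16, 3000);           // up to 16 cached peers, 3000 ms expiry
Peer cachedDomainPeer = peerCache.get(datanode, true);   // true = domain socket peer
Peer cachedTcpPeer = peerCache.get(datanode, false);     // false = ordinary TCP peer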

+ 1 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/RemoteBlockReader.java

@@ -30,7 +30,6 @@ import org.apache.hadoop.fs.FSInputChecker;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -492,8 +491,7 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
   }
 
   @Override
-  public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
-        ClientMmapManager mmapManager) {
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
     return null;
   }
 }

+ 1 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/RemoteBlockReader2.java

@@ -32,7 +32,6 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -457,8 +456,7 @@ public class RemoteBlockReader2  implements BlockReader {
   }
 
   @Override
-  public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
-        ClientMmapManager mmapManager) {
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
     return null;
   }
 }

+ 37 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/RemotePeerFactory.java

@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.net.Peer;
+import org.apache.hadoop.security.UserGroupInformation;
+
+public interface RemotePeerFactory {
+  /**
+   * @param addr          The address to connect to.
+   * 
+   * @return              A new Peer connected to the address.
+   *
+   * @throws IOException  If there was an error connecting or creating 
+   *                      the remote socket, encrypted stream, etc.
+   */
+  Peer newConnectedPeer(InetSocketAddress addr) throws IOException;
+}
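
For reference, a hedged sketch of one possible implementation, modeled on the newTcpPeer() method removed from DFSInputStream earlier in this change. The field names, the default socket factory, and the use of a null encryption key are assumptions, not part of the patch.

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;

import javax.net.SocketFactory;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.RemotePeerFactory;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.net.NetUtils;

public class TcpRemotePeerFactory implements RemotePeerFactory {
  private final SocketFactory socketFactory = SocketFactory.getDefault();
  private final int socketTimeout = 60 * 1000;   // illustrative value

  @Override
  public Peer newConnectedPeer(InetSocketAddress addr) throws IOException {
    Peer peer = null;
    Socket sock = null;
    boolean success = false;
    try {
      sock = socketFactory.createSocket();
      NetUtils.connect(sock, addr, socketTimeout);
      // No data transfer encryption in this sketch, hence the null key.
      peer = TcpPeerServer.peerFromSocketAndKey(sock, null);
      success = true;
      return peer;
    } finally {
      if (!success) {
        IOUtils.closeQuietly(peer);
        IOUtils.closeQuietly(sock);
      }
    }
  }
}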

+ 19 - 94
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java

@@ -17,24 +17,14 @@
  */
 package org.apache.hadoop.hdfs.client;
 
-import java.io.FileInputStream;
-
 import org.apache.hadoop.classification.InterfaceAudience;
-import org.apache.hadoop.hdfs.protocol.DatanodeID;
-import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.io.nativeio.NativeIO;
 
-import java.io.IOException;
-import java.lang.ref.WeakReference;
 import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel.MapMode;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-import com.google.common.annotations.VisibleForTesting;
-
 /**
  * A memory-mapped region used by an HDFS client.
  * 
@@ -46,111 +36,46 @@ public class ClientMmap {
   static final Log LOG = LogFactory.getLog(ClientMmap.class);
   
   /**
-   * A reference to the manager of this mmap.
-   * 
-   * This is only a weak reference to help minimize the damange done by
-   * code which leaks references accidentally.
+   * A reference to the block replica which this mmap relates to.
    */
-  private final WeakReference<ClientMmapManager> manager;
+  private final ShortCircuitReplica replica;
   
   /**
-   * The actual mapped memory region.
+   * The java ByteBuffer object.
    */
   private final MappedByteBuffer map;
-  
-  /**
-   * A reference count tracking how many threads are using this object.
-   */
-  private final AtomicInteger refCount = new AtomicInteger(1);
 
   /**
-   * Block pertaining to this mmap
+   * Reference count of this ClientMmap object.
    */
-  private final ExtendedBlock block;
-  
-  /**
-   * The DataNode where this mmap came from.
-   */
-  private final DatanodeID datanodeID;
-
-  /**
-   * The monotonic time when this mmap was last evictable.
-   */
-  private long lastEvictableTimeNs;
-
-  public static ClientMmap load(ClientMmapManager manager, FileInputStream in, 
-      ExtendedBlock block, DatanodeID datanodeID) 
-          throws IOException {
-    MappedByteBuffer map =
-        in.getChannel().map(MapMode.READ_ONLY, 0,
-            in.getChannel().size());
-    return new ClientMmap(manager, map, block, datanodeID);
-  }
+  private final AtomicInteger refCount = new AtomicInteger(1);
 
-  private ClientMmap(ClientMmapManager manager, MappedByteBuffer map, 
-        ExtendedBlock block, DatanodeID datanodeID) 
-            throws IOException {
-    this.manager = new WeakReference<ClientMmapManager>(manager);
+  ClientMmap(ShortCircuitReplica replica, MappedByteBuffer map) {
+    this.replica = replica;
     this.map = map;
-    this.block = block;
-    this.datanodeID = datanodeID;
-    this.lastEvictableTimeNs = 0;
   }
 
   /**
-   * Decrement the reference count on this object.
-   * Should be called with the ClientMmapManager lock held.
+   * Increment the reference count.
+   *
+   * @return   The new reference count.
    */
-  public void unref() {
-    int count = refCount.decrementAndGet();
-    if (count < 0) {
-      throw new IllegalArgumentException("can't decrement the " +
-          "reference count on this ClientMmap lower than 0.");
-    } else if (count == 0) {
-      ClientMmapManager man = manager.get();
-      if (man == null) {
-        unmap();
-      } else {
-        man.makeEvictable(this);
-      }
-    }
+  void ref() {
+    refCount.addAndGet(1);
   }
 
   /**
-   * Increment the reference count on this object.
+   * Decrement the reference count.
    *
-   * @return     The new reference count.
+   * The parent replica gets unreferenced each time the reference count 
+   * of this object goes to 0.
    */
-  public int ref() {
-    return refCount.getAndIncrement();
-  }
-
-  @VisibleForTesting
-  public ExtendedBlock getBlock() {
-    return block;
-  }
-
-  DatanodeID getDatanodeID() {
-    return datanodeID;
+  public void unref() {
+    refCount.addAndGet(-1);
+    replica.unref();
   }
 
   public MappedByteBuffer getMappedByteBuffer() {
     return map;
   }
-
-  public void setLastEvictableTimeNs(long lastEvictableTimeNs) {
-    this.lastEvictableTimeNs = lastEvictableTimeNs;
-  }
-
-  public long getLastEvictableTimeNs() {
-    return this.lastEvictableTimeNs;
-  }
-
-  /**
-   * Unmap the memory region.
-   */
-  void unmap() {
-    assert(refCount.get() == 0);
-    NativeIO.POSIX.munmap(map);
-  }
-}
+}

+ 0 - 482
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmapManager.java

@@ -1,482 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hdfs.client;
-
-import java.io.Closeable;
-
-import org.apache.hadoop.classification.InterfaceAudience;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.lang.ref.WeakReference;
-import java.util.Iterator;
-import java.util.TreeMap;
-import java.util.Map.Entry;
-import java.util.concurrent.ScheduledFuture;
-import java.util.concurrent.ScheduledThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.locks.Condition;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
-
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.protocol.DatanodeID;
-import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.io.IOUtils;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ComparisonChain;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
-
-/**
- * Tracks mmap instances used on an HDFS client.
- *
- * mmaps can be used concurrently by multiple threads at once.
- * mmaps cannot be closed while they are in use.
- *
- * The cache is important for performance, because the first time an mmap is
- * created, the page table entries (PTEs) are not yet set up.
- * Even when reading data that is entirely resident in memory, reading an
- * mmap the second time is faster.
- */
-@InterfaceAudience.Private
-public class ClientMmapManager implements Closeable {
-  public static final Log LOG = LogFactory.getLog(ClientMmapManager.class);
-
-  private boolean closed = false;
-
-  private final int cacheSize;
-
-  private final long timeoutNs;
-
-  private final int runsPerTimeout;
-
-  private final Lock lock = new ReentrantLock();
-  
-  /**
-   * Maps block, datanode_id to the client mmap object.
-   * If the ClientMmap is in the process of being loaded,
-   * {@link Waitable<ClientMmap>#await()} will block.
-   *
-   * Protected by the ClientMmapManager lock.
-   */
-  private final TreeMap<Key, Waitable<ClientMmap>> mmaps =
-      new TreeMap<Key, Waitable<ClientMmap>>();
-
-  /**
-   * Maps the last use time to the client mmap object.
-   * We ensure that each last use time is unique by inserting a jitter of a
-   * nanosecond or two if necessary.
-   * 
-   * Protected by the ClientMmapManager lock.
-   * ClientMmap objects that are in use are never evictable.
-   */
-  private final TreeMap<Long, ClientMmap> evictable =
-      new TreeMap<Long, ClientMmap>();
-
-  private final ScheduledThreadPoolExecutor executor = 
-      new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
-          setDaemon(true).setNameFormat("ClientMmapManager").
-          build());
-  
-  /**
-   * The CacheCleaner for this ClientMmapManager.  We don't create this
-   * and schedule it until it becomes necessary.
-   */
-  private CacheCleaner cacheCleaner;
-
-  /**
-   * Factory method to create a ClientMmapManager from a Hadoop
-   * configuration.
-   */
-  public static ClientMmapManager fromConf(Configuration conf) {
-    return new ClientMmapManager(conf.getInt(DFS_CLIENT_MMAP_CACHE_SIZE,
-      DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
-      conf.getLong(DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
-        DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
-      conf.getInt(DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT,
-        DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT));
-  }
-
-  public ClientMmapManager(int cacheSize, long timeoutMs, int runsPerTimeout) {
-    this.cacheSize = cacheSize;
-    this.timeoutNs = timeoutMs * 1000000;
-    this.runsPerTimeout = runsPerTimeout;
-  }
-  
-  long getTimeoutMs() {
-    return this.timeoutNs / 1000000;
-  }
-
-  int getRunsPerTimeout() {
-    return this.runsPerTimeout;
-  }
-  
-  public String verifyConfigurationMatches(Configuration conf) {
-    StringBuilder bld = new StringBuilder();
-    int cacheSize = conf.getInt(DFS_CLIENT_MMAP_CACHE_SIZE,
-                    DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT);
-    if (this.cacheSize != cacheSize) {
-      bld.append("You specified a cache size of ").append(cacheSize).
-          append(", but the existing cache size is ").append(this.cacheSize).
-          append(".  ");
-    }
-    long timeoutMs = conf.getLong(DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
-        DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT);
-    if (getTimeoutMs() != timeoutMs) {
-      bld.append("You specified a cache timeout of ").append(timeoutMs).
-          append(" ms, but the existing cache timeout is ").
-          append(getTimeoutMs()).append("ms").append(".  ");
-    }
-    int runsPerTimeout = conf.getInt(
-        DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT,
-        DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT);
-    if (getRunsPerTimeout() != runsPerTimeout) {
-      bld.append("You specified ").append(runsPerTimeout).
-          append(" runs per timeout, but the existing runs per timeout is ").
-          append(getTimeoutMs()).append(".  ");
-    }
-    return bld.toString();
-  }
-
-  private static class Waitable<T> {
-    private T val;
-    private final Condition cond;
-
-    public Waitable(Condition cond) {
-      this.val = null;
-      this.cond = cond;
-    }
-
-    public T await() throws InterruptedException {
-      while (this.val == null) {
-        this.cond.await();
-      }
-      return this.val;
-    }
-
-    public void provide(T val) {
-      this.val = val;
-      this.cond.signalAll();
-    }
-  }
-
-  private static class Key implements Comparable<Key> {
-    private final ExtendedBlock block;
-    private final DatanodeID datanode;
-    
-    Key(ExtendedBlock block, DatanodeID datanode) {
-      this.block = block;
-      this.datanode = datanode;
-    }
-
-    /**
-     * Compare two ClientMmap regions that we're storing.
-     *
-     * When we append to a block, we bump the genstamp.  It is important to 
-     * compare the genStamp here.  That way, we will not return a shorter 
-     * mmap than required.
-     */
-    @Override
-    public int compareTo(Key o) {
-      return ComparisonChain.start().
-          compare(block.getBlockId(), o.block.getBlockId()).
-          compare(block.getGenerationStamp(), o.block.getGenerationStamp()).
-          compare(block.getBlockPoolId(), o.block.getBlockPoolId()).
-          compare(datanode, o.datanode).
-          result();
-    }
-
-    @Override
-    public boolean equals(Object rhs) {
-      if (rhs == null) {
-        return false;
-      }
-      try {
-        Key o = (Key)rhs;
-        return (compareTo(o) == 0);
-      } catch (ClassCastException e) {
-        return false;
-      }
-    }
-
-    @Override
-    public int hashCode() {
-      return block.hashCode() ^ datanode.hashCode();
-    }
-  }
-
-  /**
-   * Thread which handles expiring mmaps from the cache.
-   */
-  private static class CacheCleaner implements Runnable, Closeable {
-    private WeakReference<ClientMmapManager> managerRef;
-    private ScheduledFuture<?> future;
-    
-    CacheCleaner(ClientMmapManager manager) {
-      this.managerRef= new WeakReference<ClientMmapManager>(manager);
-    }
-
-    @Override
-    public void run() {
-      ClientMmapManager manager = managerRef.get();
-      if (manager == null) return;
-      long curTime = System.nanoTime();
-      try {
-        manager.lock.lock();
-        manager.evictStaleEntries(curTime);
-      } finally {
-        manager.lock.unlock();
-      }
-    }
-    
-    void setFuture(ScheduledFuture<?> future) {
-      this.future = future;
-    }
-
-    @Override
-    public void close() throws IOException {
-      future.cancel(false);
-    }
-  }
-
-  /**
-   * Evict entries which are older than curTime + timeoutNs from the cache.
-   *
-   * NOTE: you must call this function with the lock held.
-   */
-  private void evictStaleEntries(long curTime) {
-    if (closed) {
-      return;
-    }
-    Iterator<Entry<Long, ClientMmap>> iter =
-        evictable.entrySet().iterator(); 
-    while (iter.hasNext()) {
-      Entry<Long, ClientMmap> entry = iter.next();
-      if (entry.getKey() + timeoutNs >= curTime) {
-        return;
-      }
-      ClientMmap mmap = entry.getValue();
-      Key key = new Key(mmap.getBlock(), mmap.getDatanodeID());
-      mmaps.remove(key);
-      iter.remove();
-      mmap.unmap();
-    }
-  }
-
-  /**
-   * Evict one mmap object from the cache.
-   *
-   * NOTE: you must call this function with the lock held.
-   *
-   * @return                  True if an object was evicted; false if none
-   *                          could be evicted.
-   */
-  private boolean evictOne() {
-    Entry<Long, ClientMmap> entry = evictable.pollFirstEntry();
-    if (entry == null) {
-      // We don't want to try creating another mmap region, because the
-      // cache is full.
-      return false;
-    }
-    ClientMmap evictedMmap = entry.getValue(); 
-    Key evictedKey = new Key(evictedMmap.getBlock(), 
-                             evictedMmap.getDatanodeID());
-    mmaps.remove(evictedKey);
-    evictedMmap.unmap();
-    return true;
-  }
-
-  /**
-   * Create a new mmap object.
-   * 
-   * NOTE: you must call this function with the lock held.
-   *
-   * @param key              The key which describes this mmap.
-   * @param in               The input stream to use to create the mmap.
-   * @return                 The new mmap object, or null if there were
-   *                         insufficient resources.
-   * @throws IOException     If there was an I/O error creating the mmap.
-   */
-  private ClientMmap create(Key key, FileInputStream in) throws IOException {
-    if (mmaps.size() + 1 > cacheSize) {
-      if (!evictOne()) {
-        LOG.warn("mmap cache is full (with " + cacheSize + " elements) and " +
-              "nothing is evictable.  Ignoring request for mmap with " +
-              "datanodeID=" + key.datanode + ", " + "block=" + key.block);
-        return null;
-      }
-    }
-    // Create the condition variable that other threads may wait on.
-    Waitable<ClientMmap> waitable =
-        new Waitable<ClientMmap>(lock.newCondition());
-    mmaps.put(key, waitable);
-    // Load the entry
-    boolean success = false;
-    ClientMmap mmap = null;
-    try {
-      try {
-        lock.unlock();
-        mmap = ClientMmap.load(this, in, key.block, key.datanode);
-      } finally {
-        lock.lock();
-      }
-      if (cacheCleaner == null) {
-        cacheCleaner = new CacheCleaner(this);
-        ScheduledFuture<?> future = 
-            executor.scheduleAtFixedRate(cacheCleaner,
-                timeoutNs, timeoutNs / runsPerTimeout, TimeUnit.NANOSECONDS);
-        cacheCleaner.setFuture(future);
-      }
-      success = true;
-    } finally {
-      if (!success) {
-        LOG.warn("failed to create mmap for datanodeID=" + key.datanode +
-                  ", " + "block=" + key.block);
-        mmaps.remove(key);
-      }
-      waitable.provide(mmap);
-    }
-    if (LOG.isDebugEnabled()) {
-      LOG.info("created a new ClientMmap for block " + key.block +
-          " on datanode " + key.datanode);
-    }
-    return mmap;
-  }
-
-  /**
-   * Get or create an mmap region.
-   * 
-   * @param node       The DataNode that owns the block for this mmap region.
-   * @param block      The block ID, block pool ID, and generation stamp of 
-   *                     the block we want to read.
-   * @param in         An open file for this block.  This stream is only used
-   *                     if we have to create a new mmap; if we use an
-   *                     existing one, it is ignored.
-   *
-   * @return           The client mmap region.
-   */
-  public ClientMmap fetch(DatanodeID datanodeID, ExtendedBlock block,
-      FileInputStream in) throws IOException, InterruptedException {
-    LOG.debug("fetching mmap with datanodeID=" + datanodeID + ", " +
-        "block=" + block);
-    Key key = new Key(block, datanodeID);
-    ClientMmap mmap = null;
-    try {
-      lock.lock();
-      if (closed) {
-        throw new IOException("ClientMmapManager is closed.");
-      }
-      while (mmap == null) {
-        Waitable<ClientMmap> entry = mmaps.get(key);
-        if (entry == null) {
-          return create(key, in);
-        }
-        mmap = entry.await();
-      }
-      if (mmap.ref() == 1) {
-        // When going from nobody using the mmap (ref = 0) to somebody
-        // using the mmap (ref = 1), we must make the mmap un-evictable.
-        evictable.remove(mmap.getLastEvictableTimeNs());
-      }
-    }
-    finally {
-      lock.unlock();
-    }
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("reusing existing mmap with datanodeID=" + datanodeID +
-              ", " + "block=" + block);
-    }
-    return mmap;
-  }
-
-  /**
-   * Make an mmap evictable.
-   * 
-   * When an mmap is evictable, it may be removed from the cache if necessary.
-   * mmaps can only be evictable if nobody is using them.
-   *
-   * @param mmap             The mmap to make evictable.
-   */
-  void makeEvictable(ClientMmap mmap) {
-    try {
-      lock.lock();
-      if (closed) {
-        // If this ClientMmapManager is closed, then don't bother with the
-        // cache; just close the mmap.
-        mmap.unmap();
-        return;
-      }
-      long now = System.nanoTime();
-      while (evictable.containsKey(now)) {
-        now++;
-      }
-      mmap.setLastEvictableTimeNs(now);
-      evictable.put(now, mmap);
-    } finally {
-      lock.unlock();
-    }
-  }
-
-  @Override
-  public void close() throws IOException {
-    try {
-      lock.lock();
-      closed = true;
-      IOUtils.cleanup(LOG, cacheCleaner);
-
-      // Unmap all the mmaps that nobody is using.
-      // The ones which are in use will be unmapped just as soon as people stop
-      // using them.
-      evictStaleEntries(Long.MAX_VALUE);
-
-      executor.shutdown();
-    } finally {
-      lock.unlock();
-    }
-  }
-
-  @VisibleForTesting
-  public interface ClientMmapVisitor {
-    void accept(ClientMmap mmap);
-  }
-
-  @VisibleForTesting
-  public synchronized void visitMmaps(ClientMmapVisitor visitor)
-      throws InterruptedException {
-    for (Waitable<ClientMmap> entry : mmaps.values()) {
-      visitor.accept(entry.await());
-    }
-  }
-
-  public void visitEvictable(ClientMmapVisitor visitor)
-      throws InterruptedException {
-    for (ClientMmap mmap : evictable.values()) {
-      visitor.accept(mmap);
-    }
-  }
-}

+ 881 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ShortCircuitCache.java

@@ -0,0 +1,881 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.client;
+
+import java.io.Closeable;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+import java.io.IOException;
+import java.nio.MappedByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Condition;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.ExtendedBlockId;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.ipc.RetriableException;
+import org.apache.hadoop.security.token.SecretManager.InvalidToken;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Time;
+import org.apache.hadoop.util.Waitable;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
+/**
+ * The ShortCircuitCache tracks things which the client needs to access
+ * HDFS block files via short-circuit.
+ *
+ * These things include: memory-mapped regions, file descriptors, and shared
+ * memory areas for communicating with the DataNode.
+ */
+@InterfaceAudience.Private
+public class ShortCircuitCache implements Closeable {
+  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
+
+  /**
+   * Expiry thread which makes sure that the file descriptors get closed
+   * after a while.
+   */
+  private class CacheCleaner implements Runnable, Closeable {
+    private ScheduledFuture<?> future;
+
+    /**
+     * Run the CacheCleaner thread.
+     *
+     * Whenever a thread requests a ShortCircuitReplica object, we will make
+     * sure it gets one.  That ShortCircuitReplica object can then be re-used
+     * when another thread requests a ShortCircuitReplica object for the same
+     * block.  So in that sense, there is no maximum size to the cache.
+     *
+     * However, when a ShortCircuitReplica object is unreferenced by the
+     * thread(s) that are using it, it becomes evictable.  There are two
+     * separate eviction lists-- one for mmaped objects, and another for
+     * non-mmaped objects.  We do this in order to avoid having the regular
+     * files kick the mmaped files out of the cache too quickly.  Reusing
+     * an already-existing mmap gives a huge performance boost, since the
+     * page table entries don't have to be re-populated.  Both the mmap
+     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
+     */
+    @Override
+    public void run() {
+      ShortCircuitCache.this.lock.lock();
+      try {
+        if (ShortCircuitCache.this.closed) return;
+        long curMs = Time.monotonicNow();
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(this + ": cache cleaner running at " + curMs);
+        }
+
+        int numDemoted = demoteOldEvictableMmaped(curMs);
+        int numPurged = 0;
+        Long evictionTimeNs = Long.valueOf(0);
+        while (true) {
+          Entry<Long, ShortCircuitReplica> entry = 
+              evictable.ceilingEntry(evictionTimeNs);
+          if (entry == null) break;
+          evictionTimeNs = entry.getKey();
+          long evictionTimeMs = 
+              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
+          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
+          ShortCircuitReplica replica = entry.getValue();
+          if (LOG.isTraceEnabled()) {
+            LOG.trace("CacheCleaner: purging " + replica + ": " + 
+                  StringUtils.getStackTrace(Thread.currentThread()));
+          }
+          purge(replica);
+          numPurged++;
+        }
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(this + ": finishing cache cleaner run started at " +
+            curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
+            "purged " + numPurged + " replicas.");
+        }
+      } finally {
+        ShortCircuitCache.this.lock.unlock();
+      }
+    }
+
+    @Override
+    public void close() throws IOException {
+      if (future != null) {
+        future.cancel(false);
+      }
+    }
+
+    public void setFuture(ScheduledFuture<?> future) {
+      this.future = future;
+    }
+
+    /**
+     * Get the rate at which this cleaner thread should be scheduled.
+     *
+     * We do this by taking the minimum expiration time and dividing by 4.
+     *
+     * @return the rate in milliseconds at which this thread should be
+     *         scheduled.
+     */
+    public long getRateInMs() {
+      long minLifespanMs =
+          Math.min(maxNonMmappedEvictableLifespanMs,
+              maxEvictableMmapedLifespanMs);
+      long sampleTimeMs = minLifespanMs / 4;
+      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
+    }
+  }
+
+  public interface ShortCircuitReplicaCreator {
+    /**
+     * Attempt to create a ShortCircuitReplica object.
+     *
+     * This callback will be made without holding any locks.
+     *
+     * @return a non-null ShortCircuitReplicaInfo object.
+     */
+    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
+  }
+
+  /**
+   * Lock protecting the cache.
+   */
+  private final ReentrantLock lock = new ReentrantLock();
+
+  /**
+   * The executor service that runs the cacheCleaner.
+   */
+  private final ScheduledThreadPoolExecutor executor
+      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
+          setDaemon(true).setNameFormat("ShortCircuitCache Cleaner").
+          build());
+
+  /**
+   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
+   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
+   * exception.
+   */
+  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 
+      replicaInfoMap = new HashMap<ExtendedBlockId,
+          Waitable<ShortCircuitReplicaInfo>>();
+
+  /**
+   * The CacheCleaner.  We don't create this and schedule it until it becomes
+   * necessary.
+   */
+  private CacheCleaner cacheCleaner;
+
+  /**
+   * Tree of evictable elements.
+   *
+   * Maps (unique) insertion time in nanoseconds to the element.
+   */
+  private final TreeMap<Long, ShortCircuitReplica> evictable =
+      new TreeMap<Long, ShortCircuitReplica>();
+
+  /**
+   * Maximum total size of the cache, including both mmapped and
+   * non-mmapped elements.
+   */
+  private int maxTotalSize;
+
+  /**
+   * Non-mmaped elements older than this will be closed.
+   */
+  private long maxNonMmappedEvictableLifespanMs;
+
+  /**
+   * Tree of mmaped evictable elements.
+   *
+   * Maps (unique) insertion time in nanoseconds to the element.
+   */
+  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
+      new TreeMap<Long, ShortCircuitReplica>();
+
+  /**
+   * Maximum number of mmaped evictable elements.
+   */
+  private int maxEvictableMmapedSize;
+
+  /**
+   * Mmaped elements older than this will be closed.
+   */
+  private final long maxEvictableMmapedLifespanMs;
+
+  /**
+   * The minimum number of milliseconds we'll wait after an unsuccessful
+   * mmap attempt before trying again.
+   */
+  private final long mmapRetryTimeoutMs;
+
+  /**
+   * How long we will keep replicas in the cache before declaring them
+   * to be stale.
+   */
+  private final long staleThresholdMs;
+
+  /**
+   * True if the ShortCircuitCache is closed.
+   */
+  private boolean closed = false;
+
+  /**
+   * Number of existing mmaps associated with this cache.
+   */
+  private int outstandingMmapCount = 0;
+
+  /**
+   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
+   */
+  public static ShortCircuitCache fromConf(Configuration conf) {
+    return new ShortCircuitCache(
+        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
+            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
+        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
+            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
+        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
+            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
+        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
+            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
+        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
+            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
+        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
+            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT));
+  }
+
+  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
+      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
+      long mmapRetryTimeoutMs, long staleThresholdMs) {
+    Preconditions.checkArgument(maxTotalSize >= 0);
+    this.maxTotalSize = maxTotalSize;
+    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
+    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
+    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
+    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
+    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
+    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
+    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
+    this.staleThresholdMs = staleThresholdMs;
+  }
+
+  public long getMmapRetryTimeoutMs() {
+    return mmapRetryTimeoutMs;
+  }
+
+  public long getStaleThresholdMs() {
+    return staleThresholdMs;
+  }
+
+  /**
+   * Increment the reference count of a replica, and remove it from any free
+   * list it may be in.
+   *
+   * You must hold the cache lock while calling this function.
+   *
+   * @param replica      The replica we're removing.
+   */
+  private void ref(ShortCircuitReplica replica) {
+    lock.lock();
+    try {
+      Preconditions.checkArgument(replica.refCount > 0,
+          "can't ref " + replica + " because its refCount reached " +
+          replica.refCount);
+      Long evictableTimeNs = replica.getEvictableTimeNs();
+      replica.refCount++;
+      if (evictableTimeNs != null) {
+        String removedFrom = removeEvictable(replica);
+        if (LOG.isTraceEnabled()) {
+          LOG.trace(this + ": " + removedFrom +
+              " no longer contains " + replica + ".  refCount " +
+              (replica.refCount - 1) + " -> " + replica.refCount +
+              StringUtils.getStackTrace(Thread.currentThread()));
+
+        }
+      } else if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": replica  refCount " +
+            (replica.refCount - 1) + " -> " + replica.refCount +
+            StringUtils.getStackTrace(Thread.currentThread()));
+      }
+    } finally {
+      lock.unlock();
+    }
+  }
+
+  /**
+   * Unreference a replica.
+   *
+   * You must hold the cache lock while calling this function.
+   *
+   * @param replica   The replica being unreferenced.
+   */
+  void unref(ShortCircuitReplica replica) {
+    lock.lock();
+    try {
+      String addedString = "";
+      int newRefCount = --replica.refCount;
+      if (newRefCount == 0) {
+        // Close replica, since there are no remaining references to it.
+        Preconditions.checkArgument(replica.purged,
+            "Replica " + replica + " reached a refCount of 0 without " +
+            "being purged");
+        replica.close();
+      } else if (newRefCount == 1) {
+        Preconditions.checkState(null == replica.getEvictableTimeNs(),
+            "Replica " + replica + " had a refCount higher than 1, " +
+              "but was still evictable (evictableTimeNs = " +
+                replica.getEvictableTimeNs() + ")");
+        if (!replica.purged) {
+          // Add the replica to the end of an eviction list.
+          // Eviction lists are sorted by time.
+          if (replica.hasMmap()) {
+            insertEvictable(System.nanoTime(), replica, evictableMmapped);
+            addedString = "added to evictableMmapped, ";
+          } else {
+            insertEvictable(System.nanoTime(), replica, evictable);
+            addedString = "added to evictable, ";
+          }
+          trimEvictionMaps();
+        }
+      } else {
+        Preconditions.checkArgument(replica.refCount >= 0,
+            "replica's refCount went negative (refCount = " +
+            replica.refCount + " for " + replica + ")");
+      }
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": unref replica " + replica +
+            ": " + addedString + " refCount " +
+            (newRefCount + 1) + " -> " + newRefCount +
+            StringUtils.getStackTrace(Thread.currentThread()));
+      }
+    } finally {
+      lock.unlock();
+    }
+  }
+
+  /**
+   * Demote old evictable mmaps into the regular eviction map.
+   *
+   * You must hold the cache lock while calling this function.
+   *
+   * @param now   Current time in monotonic milliseconds.
+   * @return      Number of replicas demoted.
+   */
+  private int demoteOldEvictableMmaped(long now) {
+    int numDemoted = 0;
+    boolean needMoreSpace = false;
+    Long evictionTimeNs = Long.valueOf(0);
+
+    while (true) {
+      Entry<Long, ShortCircuitReplica> entry = 
+          evictableMmapped.ceilingEntry(evictionTimeNs);
+      if (entry == null) break;
+      evictionTimeNs = entry.getKey();
+      long evictionTimeMs = 
+          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
+      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
+        if (evictableMmapped.size() < maxEvictableMmapedSize) {
+          break;
+        }
+        needMoreSpace = true;
+      }
+      ShortCircuitReplica replica = entry.getValue();
+      if (LOG.isTraceEnabled()) {
+        String rationale = needMoreSpace ? "because we need more space" : 
+            "because it's too old";
+        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
+            rationale + ": " +
+            StringUtils.getStackTrace(Thread.currentThread()));
+      }
+      removeEvictable(replica, evictableMmapped);
+      munmap(replica);
+      insertEvictable(evictionTimeNs, replica, evictable);
+      numDemoted++;
+    }
+    return numDemoted;
+  }
+
+  /**
+   * Trim the eviction lists.
+   */
+  private void trimEvictionMaps() {
+    long now = Time.monotonicNow();
+    demoteOldEvictableMmaped(now);
+
+    while (true) {
+      long evictableSize = evictable.size();
+      long evictableMmappedSize = evictableMmapped.size();
+      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
+        return;
+      }
+      ShortCircuitReplica replica;
+      if (evictableSize == 0) {
+       replica = evictableMmapped.firstEntry().getValue();
+      } else {
+       replica = evictable.firstEntry().getValue();
+      }
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": trimEvictionMaps is purging " +
+          StringUtils.getStackTrace(Thread.currentThread()));
+      }
+      purge(replica);
+    }
+  }
+
+  /**
+   * Munmap a replica, updating outstandingMmapCount.
+   *
+   * @param replica  The replica to munmap.
+   */
+  private void munmap(ShortCircuitReplica replica) {
+    replica.munmap();
+    outstandingMmapCount--;
+  }
+
+  /**
+   * Remove a replica from an evictable map.
+   *
+   * @param replica   The replica to remove.
+   * @return          The map it was removed from.
+   */
+  private String removeEvictable(ShortCircuitReplica replica) {
+    if (replica.hasMmap()) {
+      removeEvictable(replica, evictableMmapped);
+      return "evictableMmapped";
+    } else {
+      removeEvictable(replica, evictable);
+      return "evictable";
+    }
+  }
+
+  /**
+   * Remove a replica from an evictable map.
+   *
+   * @param replica   The replica to remove.
+   * @param map       The map to remove it from.
+   */
+  private void removeEvictable(ShortCircuitReplica replica,
+      TreeMap<Long, ShortCircuitReplica> map) {
+    Long evictableTimeNs = replica.getEvictableTimeNs();
+    Preconditions.checkNotNull(evictableTimeNs);
+    ShortCircuitReplica removed = map.remove(evictableTimeNs);
+    Preconditions.checkState(removed == replica,
+        "failed to make " + replica + " unevictable");
+    replica.setEvictableTimeNs(null);
+  }
+
+  /**
+   * Insert a replica into an evictable map.
+   *
+   * If an element already exists with this eviction time, we add a nanosecond
+   * to it until we find an unused key.
+   *
+   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
+   * @param replica          The replica to insert.
+   * @param map              The map to insert it into.
+   */
+  private void insertEvictable(Long evictionTimeNs,
+      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
+    while (map.containsKey(evictionTimeNs)) {
+      evictionTimeNs++;
+    }
+    Preconditions.checkState(null == replica.getEvictableTimeNs());
+    Long time = Long.valueOf(evictionTimeNs);
+    replica.setEvictableTimeNs(time);
+    map.put(time, replica);
+  }
+
+  /**
+   * Purge a replica from the cache.
+   *
+   * This doesn't necessarily close the replica, since there may be
+   * outstanding references to it.  However, it does mean the cache won't
+   * hand it out to anyone after this.
+   *
+   * You must hold the cache lock while calling this function.
+   *
+   * @param replica   The replica being removed.
+   */
+  private void purge(ShortCircuitReplica replica) {
+    boolean removedFromInfoMap = false;
+    String evictionMapName = null;
+    Preconditions.checkArgument(!replica.purged);
+    replica.purged = true;
+    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
+    if (val != null) {
+      ShortCircuitReplicaInfo info = val.getVal();
+      if ((info != null) && (info.getReplica() == replica)) {
+        replicaInfoMap.remove(replica.key);
+        removedFromInfoMap = true;
+      }
+    }
+    Long evictableTimeNs = replica.getEvictableTimeNs();
+    if (evictableTimeNs != null) {
+      evictionMapName = removeEvictable(replica);
+    }
+    if (LOG.isTraceEnabled()) {
+      StringBuilder builder = new StringBuilder();
+      builder.append(this).append(": ").append(": removed ").
+          append(replica).append(" from the cache.");
+      if (removedFromInfoMap) {
+        builder.append("  Removed from the replicaInfoMap.");
+      }
+      if (evictionMapName != null) {
+        builder.append("  Removed from ").append(evictionMapName);
+      }
+      LOG.trace(builder.toString());
+    }
+    unref(replica);
+  }
+
+  /**
+   * Fetch or create a replica.
+   *
+   * This method acquires the cache lock internally; callers must not hold it,
+   * since the creator callback is invoked without any locks held.
+   *
+   * @param key          Key to use for lookup.
+   * @param creator      Replica creator callback.  Will be called without
+   *                     the cache lock being held.
+   *
+   * @return             Null if no replica could be found or created.
+   *                     The replica, otherwise.
+   */
+  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
+      ShortCircuitReplicaCreator creator) {
+    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
+    lock.lock();
+    try {
+      ShortCircuitReplicaInfo info = null;
+      do {
+        if (closed) {
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(this + ": can't fetchOrCreate " + key +
+                " because the cache is closed.");
+          }
+          return null;
+        }
+        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
+        if (waitable != null) {
+          try {
+            info = fetch(key, waitable);
+          } catch (RetriableException e) {
+            if (LOG.isDebugEnabled()) {
+              LOG.debug(this + ": retrying " + e.getMessage());
+            }
+            continue;
+          }
+        }
+      } while (false);
+      if (info != null) return info;
+      // We need to load the replica ourselves.
+      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
+      replicaInfoMap.put(key, newWaitable);
+    } finally {
+      lock.unlock();
+    }
+    return create(key, creator, newWaitable);
+  }
+
+  /**
+   * Fetch an existing ReplicaInfo object.
+   *
+   * @param key       The key that we're using.
+   * @param waitable  The waitable object to wait on.
+   * @return          The existing ReplicaInfo object, or null if there is
+   *                  none.
+   *
+   * @throws RetriableException   If the caller needs to retry.
+   */
+  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
+      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
+    // Another thread is already in the process of loading this
+    // ShortCircuitReplica.  So we simply wait for it to complete.
+    ShortCircuitReplicaInfo info;
+    try {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": found waitable for " + key);
+      }
+      info = waitable.await();
+    } catch (InterruptedException e) {
+      LOG.info(this + ": interrupted while waiting for " + key);
+      Thread.currentThread().interrupt();
+      throw new RetriableException("interrupted");
+    }
+    if (info.getInvalidTokenException() != null) {
+      LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
+            "exception.", info.getInvalidTokenException());
+      return info;
+    }
+    ShortCircuitReplica replica = info.getReplica();
+    if (replica == null) {
+      LOG.warn(this + ": failed to get " + key);
+      return info;
+    }
+    if (replica.purged) {
+      // Ignore replicas that have already been purged from the cache.
+      throw new RetriableException("Ignoring purged replica " +
+          replica + ".  Retrying.");
+    }
+    // Check if the replica is stale before using it.
+    // If it is, purge it and retry.
+    if (replica.isStale()) {
+      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
+          "this replica from the replicaInfoMap and retrying.");
+      // Remove the cache's reference to the replica.  This may or may not
+      // trigger a close.
+      purge(replica);
+      throw new RetriableException("ignoring stale replica " + replica);
+    }
+    ref(replica);
+    return info;
+  }
+
+  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
+      ShortCircuitReplicaCreator creator,
+      Waitable<ShortCircuitReplicaInfo> newWaitable) {
+    // Handle loading a new replica.
+    ShortCircuitReplicaInfo info = null;
+    try {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + ": loading " + key);
+      }
+      info = creator.createShortCircuitReplicaInfo();
+    } catch (RuntimeException e) {
+      LOG.warn(this + ": failed to load " + key, e);
+    }
+    if (info == null) info = new ShortCircuitReplicaInfo();
+    lock.lock();
+    try {
+      if (info.getReplica() != null) {
+        // On success, make sure the cache cleaner thread is running.
+        if (LOG.isTraceEnabled()) {
+          LOG.trace(this + ": successfully loaded " + info.getReplica());
+        }
+        startCacheCleanerThreadIfNeeded();
+        // Note: new ShortCircuitReplicas start with a refCount of 2,
+        // indicating that both this cache and whoever requested the 
+        // creation of the replica hold a reference.  So we don't need
+        // to increment the reference count here.
+      } else {
+        // On failure, remove the waitable from the replicaInfoMap.
+        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
+        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
+        if (info.getInvalidTokenException() != null) {
+          LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
+              "exception.", info.getInvalidTokenException());
+        } else {
+          LOG.warn(this + ": failed to load " + key);
+        }
+      }
+      newWaitable.provide(info);
+    } finally {
+      lock.unlock();
+    }
+    return info;
+  }
+
+  private void startCacheCleanerThreadIfNeeded() {
+    if (cacheCleaner == null) {
+      cacheCleaner = new CacheCleaner();
+      long rateMs = cacheCleaner.getRateInMs();
+      ScheduledFuture<?> future =
+          executor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
+              TimeUnit.MILLISECONDS);
+      cacheCleaner.setFuture(future);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(this + ": starting cache cleaner thread which will run " +
+          "every " + rateMs + " ms");
+      }
+    }
+  }
+
+  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica) {
+    Condition newCond;
+    lock.lock();
+    try {
+      while (replica.mmapData != null) {
+        if (replica.mmapData instanceof ClientMmap) {
+          ref(replica);
+          ClientMmap clientMmap = (ClientMmap)replica.mmapData;
+          clientMmap.ref();
+          return clientMmap;
+        } else if (replica.mmapData instanceof Long) {
+          long lastAttemptTimeMs = (Long)replica.mmapData;
+          long delta = Time.monotonicNow() - lastAttemptTimeMs;
+          if (delta < mmapRetryTimeoutMs) {
+            if (LOG.isTraceEnabled()) {
+              LOG.trace(this + ": can't create client mmap for " +
+                  replica + " because we failed to " +
+                  "create one just " + delta + "ms ago.");
+            }
+            return null;
+          }
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(this + ": retrying client mmap for " + replica +
+                ", " + delta + " ms after the previous failure.");
+          }
+        } else if (replica.mmapData instanceof Condition) {
+          Condition cond = (Condition)replica.mmapData;
+          cond.awaitUninterruptibly();
+        } else {
+          Preconditions.checkState(false, "invalid mmapData type " +
+              replica.mmapData.getClass().getName());
+        }
+      }
+      newCond = lock.newCondition();
+      replica.mmapData = newCond;
+    } finally {
+      lock.unlock();
+    }
+    MappedByteBuffer map = replica.loadMmapInternal();
+    lock.lock();
+    try {
+      if (map == null) {
+        replica.mmapData = Long.valueOf(Time.monotonicNow());
+        newCond.signalAll();
+        return null;
+      } else {
+        ClientMmap clientMmap = new ClientMmap(replica, map);
+        outstandingMmapCount++;
+        replica.mmapData = clientMmap;
+        ref(replica);
+        newCond.signalAll();
+        return clientMmap;
+      }
+    } finally {
+      lock.unlock();
+    }
+  }
+
+  /**
+   * Close the cache and free all associated resources.
+   */
+  public void close() {
+    try {
+      lock.lock();
+      if (closed) return;
+      closed = true;
+      LOG.info(this + ": closing");
+      maxNonMmappedEvictableLifespanMs = 0;
+      maxEvictableMmapedSize = 0;
+      // Close and join cacheCleaner thread.
+      IOUtils.cleanup(LOG, cacheCleaner);
+      // Purge all replicas.
+      while (true) {
+        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
+        if (entry == null) break;
+        purge(entry.getValue());
+      }
+      while (true) {
+        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
+        if (entry == null) break;
+        purge(entry.getValue());
+      }
+    } finally {
+      lock.unlock();
+    }
+  }
+
+  @VisibleForTesting // ONLY for testing
+  public interface CacheVisitor {
+    void visit(int numOutstandingMmaps,
+        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
+        Map<ExtendedBlockId, InvalidToken> failedLoads,
+        Map<Long, ShortCircuitReplica> evictable,
+        Map<Long, ShortCircuitReplica> evictableMmapped);
+  }
+
+  @VisibleForTesting // ONLY for testing
+  public void accept(CacheVisitor visitor) {
+    lock.lock();
+    try {
+      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
+          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
+      Map<ExtendedBlockId, InvalidToken> failedLoads =
+          new HashMap<ExtendedBlockId, InvalidToken>();
+      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
+            replicaInfoMap.entrySet()) {
+        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
+        if (waitable.hasVal()) {
+          if (waitable.getVal().getReplica() != null) {
+            replicas.put(entry.getKey(), waitable.getVal().getReplica());
+          } else {
+            // The exception may be null here, indicating a failed load that
+            // isn't the result of an invalid block token.
+            failedLoads.put(entry.getKey(),
+                waitable.getVal().getInvalidTokenException());
+          }
+        }
+      }
+      if (LOG.isDebugEnabled()) {
+        StringBuilder builder = new StringBuilder();
+        builder.append("visiting ").append(visitor.getClass().getName()).
+            append("with outstandingMmapCount=").append(outstandingMmapCount).
+            append(", replicas=");
+        String prefix = "";
+        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
+          builder.append(prefix).append(entry.getValue());
+          prefix = ",";
+        }
+        prefix = "";
+        builder.append(", failedLoads=");
+        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
+          builder.append(prefix).append(entry.getValue());
+          prefix = ",";
+        }
+        prefix = "";
+        builder.append(", evictable=");
+        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
+          builder.append(prefix).append(entry.getKey()).
+              append(":").append(entry.getValue());
+          prefix = ",";
+        }
+        prefix = "";
+        builder.append(", evictableMmapped=");
+        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
+          builder.append(prefix).append(entry.getKey()).
+              append(":").append(entry.getValue());
+          prefix = ",";
+        }
+        LOG.debug(builder.toString());
+      }
+      visitor.visit(outstandingMmapCount, replicas, failedLoads,
+            evictable, evictableMmapped);
+    } finally {
+      lock.unlock();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ShortCircuitCache(0x" +
+        Integer.toHexString(System.identityHashCode(this)) + ")";
+  }
+}
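
A note on the fetchOrCreate() contract above: lookups and Waitable bookkeeping happen under the cache lock, while the potentially slow creation of file descriptors happens in the caller-supplied ShortCircuitReplicaCreator with no locks held, so concurrent readers of the same block wait on one Waitable instead of each opening descriptors. The sketch below shows how a caller might plug into that contract; it assumes only the classes added in this patch, and the openDataStream()/openMetaStream() helpers are hypothetical stand-ins for the descriptor request that the real client sends to the DataNode.

    import java.io.FileInputStream;
    import java.io.IOException;

    import org.apache.hadoop.hdfs.ExtendedBlockId;
    import org.apache.hadoop.hdfs.client.ShortCircuitCache;
    import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
    import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
    import org.apache.hadoop.util.Time;

    /** Sketch only: how a caller plugs into fetchOrCreate().  The cache itself
     *  would typically come from ShortCircuitCache.fromConf(conf). */
    abstract class ShortCircuitCacheUsageSketch {

      /** Hypothetical helpers; the real client obtains these descriptors
       *  from the DataNode over a domain socket. */
      abstract FileInputStream openDataStream(ExtendedBlockId key) throws IOException;
      abstract FileInputStream openMetaStream(ExtendedBlockId key) throws IOException;

      ShortCircuitReplica fetchReplica(final ShortCircuitCache cache,
          long blockId, String bpid) {
        final ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
        ShortCircuitReplicaInfo info = cache.fetchOrCreate(key,
            new ShortCircuitCache.ShortCircuitReplicaCreator() {
              @Override
              public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
                // Runs with no locks held, so slow I/O is safe here.
                try {
                  ShortCircuitReplica replica = new ShortCircuitReplica(key,
                      openDataStream(key), openMetaStream(key), cache,
                      Time.monotonicNow());
                  return new ShortCircuitReplicaInfo(replica);
                } catch (IOException e) {
                  // An "empty" info tells the cache the load failed.
                  return new ShortCircuitReplicaInfo();
                }
              }
            });
        // fetchOrCreate returns null only if the cache has been closed.
        return (info == null) ? null : info.getReplica();
      }
    }

A new replica starts with refCount 2, one reference for the cache and one for the requester, so the caller must eventually drop its reference with replica.unref().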

+ 268 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ShortCircuitReplica.java

@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.client;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hdfs.ExtendedBlockId;
+import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.nativeio.NativeIO;
+import org.apache.hadoop.util.Time;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+
+/**
+ * A ShortCircuitReplica object contains file descriptors for a block that
+ * we are reading via short-circuit local reads.
+ *
+ * The file descriptors can be shared between multiple threads because
+ * all the operations we perform are stateless-- i.e., we use pread
+ * instead of read, to avoid using the shared position state.
+ */
+@InterfaceAudience.Private
+public class ShortCircuitReplica {
+  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
+
+  /**
+   * Identifies this ShortCircuitReplica object.
+   */
+  final ExtendedBlockId key;
+
+  /**
+   * The block data input stream.
+   */
+  private final FileInputStream dataStream;
+
+  /**
+   * The block metadata input stream.
+   *
+   * TODO: make this nullable if the file has no checksums on disk.
+   */
+  private final FileInputStream metaStream;
+
+  /**
+   * Block metadata header.
+   */
+  private final BlockMetadataHeader metaHeader;
+
+  /**
+   * The cache we belong to.
+   */
+  private final ShortCircuitCache cache;
+
+  /**
+   * Monotonic time at which the replica was created.
+   */
+  private final long creationTimeMs;
+
+  /**
+   * Current mmap state.
+   *
+   * Protected by the cache lock.
+   */
+  Object mmapData;
+
+  /**
+   * True if this replica has been purged from the cache; false otherwise.
+   *
+   * Protected by the cache lock.
+   */
+  boolean purged = false;
+
+  /**
+   * Number of external references to this replica.  Replicas are referenced
+   * by the cache, BlockReaderLocal instances, and by ClientMmap instances.
+   * The number starts at 2 because when we create a replica, it is referenced
+   * by both the cache and the requester.
+   *
+   * Protected by the cache lock.
+   */
+  int refCount = 2;
+
+  /**
+   * The monotonic time in nanoseconds at which the replica became evictable, or
+   * null if it is not evictable.
+   *
+   * Protected by the cache lock.
+   */
+  private Long evictableTimeNs = null;
+
+  public ShortCircuitReplica(ExtendedBlockId key,
+      FileInputStream dataStream, FileInputStream metaStream,
+      ShortCircuitCache cache, long creationTimeMs) throws IOException {
+    this.key = key;
+    this.dataStream = dataStream;
+    this.metaStream = metaStream;
+    this.metaHeader =
+          BlockMetadataHeader.preadHeader(metaStream.getChannel());
+    if (metaHeader.getVersion() != 1) {
+      throw new IOException("invalid metadata header version " +
+          metaHeader.getVersion() + ".  Can only handle version 1.");
+    }
+    this.cache = cache;
+    this.creationTimeMs = creationTimeMs;
+  }
+
+  /**
+   * Decrement the reference count.
+   */
+  public void unref() {
+    cache.unref(this);
+  }
+
+  /**
+   * Check if the replica is stale.
+   *
+   * Must be called with the cache lock held.
+   */
+  boolean isStale() {
+    long deltaMs = Time.monotonicNow() - creationTimeMs;
+    long staleThresholdMs = cache.getStaleThresholdMs();
+    if (deltaMs > staleThresholdMs) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + " is stale because it's " + deltaMs +
+            " ms old, and staleThresholdMs = " + staleThresholdMs);
+      }
+      return true;
+    } else {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(this + " is not stale because it's only " + deltaMs +
+            " ms old, and staleThresholdMs = " + staleThresholdMs);
+      }
+      return false;
+    }
+  }
+
+  /**
+   * Check if the replica has an associated mmap that has been fully loaded.
+   *
+   * Must be called with the cache lock held.
+   */
+  @VisibleForTesting
+  public boolean hasMmap() {
+    return ((mmapData != null) && (mmapData instanceof ClientMmap));
+  }
+
+  /**
+   * Free the mmap associated with this replica.
+   *
+   * Must be called with the cache lock held.
+   */
+  void munmap() {
+    ClientMmap clientMmap = (ClientMmap)mmapData;
+    NativeIO.POSIX.munmap(clientMmap.getMappedByteBuffer());
+    mmapData = null;
+  }
+
+  /**
+   * Close the replica.
+   *
+   * Must be called after there are no more references to the replica in the
+   * cache or elsewhere.
+   */
+  void close() {
+    Preconditions.checkState(refCount == 0,
+        "tried to close replica with refCount " + refCount + ": " + this);
+    Preconditions.checkState(purged,
+        "tried to close unpurged replica " + this);
+    if (hasMmap()) munmap();
+    IOUtils.cleanup(LOG, dataStream, metaStream);
+  }
+
+  public FileInputStream getDataStream() {
+    return dataStream;
+  }
+
+  public FileInputStream getMetaStream() {
+    return metaStream;
+  }
+
+  public BlockMetadataHeader getMetaHeader() {
+    return metaHeader;
+  }
+
+  public ExtendedBlockId getKey() {
+    return key;
+  }
+
+  public ClientMmap getOrCreateClientMmap() {
+    return cache.getOrCreateClientMmap(this);
+  }
+
+  MappedByteBuffer loadMmapInternal() {
+    try {
+      FileChannel channel = dataStream.getChannel();
+      return channel.map(MapMode.READ_ONLY, 0, channel.size());
+    } catch (IOException e) {
+      LOG.warn(this + ": mmap error", e);
+      return null;
+    } catch (RuntimeException e) {
+      LOG.warn(this + ": mmap error", e);
+      return null;
+    }
+  }
+
+  /**
+   * Get the evictable time in nanoseconds.
+   *
+   * Note: you must hold the cache lock to call this function.
+   *
+   * @return the evictable time in nanoseconds.
+   */
+  public Long getEvictableTimeNs() {
+    return evictableTimeNs;
+  }
+
+  /**
+   * Set the evictable time in nanoseconds.
+   *
+   * Note: you must hold the cache lock to call this function.
+   *
+   * @param evictableTimeNs   The evictable time in nanoseconds, or null
+   *                          to set no evictable time.
+   */
+  void setEvictableTimeNs(Long evictableTimeNs) {
+    this.evictableTimeNs = evictableTimeNs;
+  }
+
+  /**
+   * Convert the replica to a string for debugging purposes.
+   * Note that we can't take the lock here.
+   */
+  @Override
+  public String toString() {
+    return new StringBuilder().append("ShortCircuitReplica{").
+        append("key=").append(key).
+        append(", metaHeader.version=").append(metaHeader.getVersion()).
+        append(", metaHeader.checksum=").append(metaHeader.getChecksum()).
+        append(", ident=").append("0x").
+          append(Integer.toHexString(System.identityHashCode(this))).
+        append(", creationTimeMs=").append(creationTimeMs).
+        append("}").toString();
+  }
+}
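
The class comment's point about pread-style access is what makes a single ShortCircuitReplica shareable across threads: every read supplies its own offset instead of moving the stream's shared file position. A minimal consumer-side sketch, assuming only the public accessors shown above:

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;

    import org.apache.hadoop.hdfs.client.ShortCircuitReplica;

    /** Sketch only: positional (pread-style) access to a shared replica. */
    final class ReplicaPreadSketch {
      /**
       * Read up to buf.remaining() bytes at offsetInBlock.  Safe for several
       * threads sharing one replica, because FileChannel.read(buf, position)
       * never touches the stream's shared file position.
       */
      static int pread(ShortCircuitReplica replica, ByteBuffer buf,
          long offsetInBlock) throws IOException {
        FileChannel channel = replica.getDataStream().getChannel();
        return channel.read(buf, offsetInBlock);
      }
    }

As with the cache sketch earlier, the caller still owns one reference and should release it with replica.unref() when finished.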

+ 64 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ShortCircuitReplicaInfo.java

@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.client;
+
+import org.apache.hadoop.security.token.SecretManager.InvalidToken;
+
+public final class ShortCircuitReplicaInfo {
+  private final ShortCircuitReplica replica;
+  private final InvalidToken exc; 
+
+  public ShortCircuitReplicaInfo() {
+    this.replica = null;
+    this.exc = null;
+  }
+
+  public ShortCircuitReplicaInfo(ShortCircuitReplica replica) {
+    this.replica = replica;
+    this.exc = null;
+  }
+
+  public ShortCircuitReplicaInfo(InvalidToken exc) {
+    this.replica = null;
+    this.exc = exc;
+  }
+
+  public ShortCircuitReplica getReplica() {
+    return replica;
+  }
+
+  public InvalidToken getInvalidTokenException() {
+    return exc; 
+  }
+  
+  public String toString() {
+    StringBuilder builder = new StringBuilder();
+    String prefix = "";
+    builder.append("ShortCircuitReplicaInfo{");
+    if (replica != null) {
+      builder.append(prefix).append(replica);
+      prefix = ", ";
+    }
+    if (exc != null) {
+      builder.append(prefix).append(exc);
+      prefix = ", ";
+    }
+    builder.append("}");
+    return builder.toString();
+  }
+}
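
ShortCircuitReplicaInfo is effectively a tagged result for the creator callback: a successful load carries a replica, a rejected block token carries the InvalidToken, and the no-argument form reports any other failure. A sketch of that mapping, where Loader.load() is a hypothetical stand-in for the descriptor request:

    import java.io.IOException;

    import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
    import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
    import org.apache.hadoop.security.token.SecretManager.InvalidToken;

    /** Sketch only: mapping load outcomes onto the three constructors. */
    final class ReplicaInfoOutcomes {
      /** Hypothetical stand-in for the descriptor request to the DataNode. */
      interface Loader {
        ShortCircuitReplica load() throws IOException;
      }

      static ShortCircuitReplicaInfo toInfo(Loader loader) {
        try {
          return new ShortCircuitReplicaInfo(loader.load()); // success
        } catch (InvalidToken e) {
          return new ShortCircuitReplicaInfo(e);             // block token rejected
        } catch (IOException e) {
          return new ShortCircuitReplicaInfo();              // any other failure
        }
      }
    }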

+ 2 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java

@@ -121,7 +121,8 @@ public class LayoutVersion {
     ADD_DATANODE_AND_STORAGE_UUIDS(-49, "Replace StorageID with DatanodeUuid."
         + " Use distinct StorageUuid per storage directory."),
     ADD_LAYOUT_FLAGS(-50, "Add support for layout flags."),
-    CACHING(-51, "Support for cache pools and path-based caching");
+    CACHING(-51, "Support for cache pools and path-based caching"),
+    PROTOBUF_FORMAT(-52, "Use protobuf to serialize FSImage");
 
     private final FeatureInfo info;
 

+ 3 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java

@@ -103,9 +103,10 @@ public class DatanodeProtocolClientSideTranslatorPB implements
   private static DatanodeProtocolPB createNamenode(
       InetSocketAddress nameNodeAddr, Configuration conf,
       UserGroupInformation ugi) throws IOException {
-    return RPC.getProxy(DatanodeProtocolPB.class,
+    return RPC.getProtocolProxy(DatanodeProtocolPB.class,
         RPC.getProtocolVersion(DatanodeProtocolPB.class), nameNodeAddr, ugi,
-        conf, NetUtils.getSocketFactory(conf, DatanodeProtocolPB.class));
+        conf, NetUtils.getSocketFactory(conf, DatanodeProtocolPB.class),
+        org.apache.hadoop.ipc.Client.getPingInterval(conf), null).getProxy();
   }
 
   /** Create a {@link NameNode} proxy */

+ 85 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java

@@ -23,12 +23,16 @@ import java.io.DataOutputStream;
 import java.io.IOException;
 import java.io.InterruptedIOException;
 import java.net.InetSocketAddress;
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
@@ -46,6 +50,10 @@ import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager;
 import org.apache.hadoop.security.token.delegation.DelegationKey;
 
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.protobuf.ByteString;
+
 /**
  * A HDFS specific delegation token secret manager.
  * The secret manager is responsible for generating and accepting the password
@@ -167,7 +175,45 @@ public class DelegationTokenSecretManager
     }
     serializerCompat.load(in);
   }
-  
+
+  public static class SecretManagerState {
+    public final SecretManagerSection section;
+    public final List<SecretManagerSection.DelegationKey> keys;
+    public final List<SecretManagerSection.PersistToken> tokens;
+
+    public SecretManagerState(
+        SecretManagerSection s,
+        List<SecretManagerSection.DelegationKey> keys,
+        List<SecretManagerSection.PersistToken> tokens) {
+      this.section = s;
+      this.keys = keys;
+      this.tokens = tokens;
+    }
+  }
+
+  public synchronized void loadSecretManagerState(SecretManagerState state)
+      throws IOException {
+    Preconditions.checkState(!running,
+        "Can't load state from image in a running SecretManager.");
+
+    currentId = state.section.getCurrentId();
+    delegationTokenSequenceNumber = state.section.getTokenSequenceNumber();
+    for (SecretManagerSection.DelegationKey k : state.keys) {
+      addKey(new DelegationKey(k.getId(), k.getExpiryDate(), k.hasKey() ? k
+          .getKey().toByteArray() : null));
+    }
+
+    for (SecretManagerSection.PersistToken t : state.tokens) {
+      DelegationTokenIdentifier id = new DelegationTokenIdentifier(new Text(
+          t.getOwner()), new Text(t.getRenewer()), new Text(t.getRealUser()));
+      id.setIssueDate(t.getIssueDate());
+      id.setMaxDate(t.getMaxDate());
+      id.setSequenceNumber(t.getSequenceNumber());
+      id.setMasterKeyId(t.getMasterKeyId());
+      addPersistedDelegationToken(id, t.getExpiryDate());
+    }
+  }
+
   /**
    * Store the current state of the SecretManager for persistence
    * 
@@ -179,7 +225,43 @@ public class DelegationTokenSecretManager
       String sdPath) throws IOException {
     serializerCompat.save(out, sdPath);
   }
-  
+
+  public synchronized SecretManagerState saveSecretManagerState() {
+    SecretManagerSection s = SecretManagerSection.newBuilder()
+        .setCurrentId(currentId)
+        .setTokenSequenceNumber(delegationTokenSequenceNumber)
+        .setNumKeys(allKeys.size()).setNumTokens(currentTokens.size()).build();
+    ArrayList<SecretManagerSection.DelegationKey> keys = Lists
+        .newArrayListWithCapacity(allKeys.size());
+    ArrayList<SecretManagerSection.PersistToken> tokens = Lists
+        .newArrayListWithCapacity(currentTokens.size());
+
+    for (DelegationKey v : allKeys.values()) {
+      SecretManagerSection.DelegationKey.Builder b = SecretManagerSection.DelegationKey
+          .newBuilder().setId(v.getKeyId()).setExpiryDate(v.getExpiryDate());
+      if (v.getEncodedKey() != null) {
+        b.setKey(ByteString.copyFrom(v.getEncodedKey()));
+      }
+      keys.add(b.build());
+    }
+
+    for (Entry<DelegationTokenIdentifier, DelegationTokenInformation> e : currentTokens
+        .entrySet()) {
+      DelegationTokenIdentifier id = e.getKey();
+      SecretManagerSection.PersistToken.Builder b = SecretManagerSection.PersistToken
+          .newBuilder().setOwner(id.getOwner().toString())
+          .setRenewer(id.getRenewer().toString())
+          .setRealUser(id.getRealUser().toString())
+          .setIssueDate(id.getIssueDate()).setMaxDate(id.getMaxDate())
+          .setSequenceNumber(id.getSequenceNumber())
+          .setMasterKeyId(id.getMasterKeyId())
+          .setExpiryDate(e.getValue().getRenewDate());
+      tokens.add(b.build());
+    }
+
+    return new SecretManagerState(s, keys, tokens);
+  }
+
   /**
    * This method is intended to be used only while reading edit logs.
    * 
@@ -431,4 +513,5 @@ public class DelegationTokenSecretManager
       prog.endStep(Phase.LOADING_FSIMAGE, step);
     }
   }
+
 }
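
saveSecretManagerState()/loadSecretManagerState() give the new protobuf FSImage format a flat snapshot of the secret manager: one SecretManagerSection header plus repeated DelegationKey and PersistToken messages. The sketch below round-trips such a state with length-delimited protobuf messages over plain streams; the real saver and loader live in the FSImage code and also track section offsets and lengths, so treat this only as an illustration of the data flow.

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
    import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
    import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;

    /** Sketch only: round-tripping SecretManagerState with delimited protobuf. */
    final class SecretManagerStateSketch {

      static void save(DelegationTokenSecretManager sm, OutputStream out)
          throws IOException {
        SecretManagerState state = sm.saveSecretManagerState();
        state.section.writeDelimitedTo(out);
        for (SecretManagerSection.DelegationKey k : state.keys) {
          k.writeDelimitedTo(out);
        }
        for (SecretManagerSection.PersistToken t : state.tokens) {
          t.writeDelimitedTo(out);
        }
      }

      static void load(DelegationTokenSecretManager sm, InputStream in)
          throws IOException {
        SecretManagerSection s = SecretManagerSection.parseDelimitedFrom(in);
        List<SecretManagerSection.DelegationKey> keys =
            new ArrayList<SecretManagerSection.DelegationKey>();
        for (int i = 0; i < s.getNumKeys(); i++) {
          keys.add(SecretManagerSection.DelegationKey.parseDelimitedFrom(in));
        }
        List<SecretManagerSection.PersistToken> tokens =
            new ArrayList<SecretManagerSection.PersistToken>();
        for (int i = 0; i < s.getNumTokens(); i++) {
          tokens.add(SecretManagerSection.PersistToken.parseDelimitedFrom(in));
        }
        sm.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
      }
    }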

+ 5 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java

@@ -633,9 +633,11 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
     // check the communication traffic of the target machine
     if (considerLoad) {
       double avgLoad = 0;
-      int size = clusterMap.getNumOfLeaves();
-      if (size != 0 && stats != null) {
-        avgLoad = (double)stats.getTotalLoad()/size;
+      if (stats != null) {
+        int size = stats.getNumDatanodesInService();
+        if (size != 0) {
+          avgLoad = (double)stats.getTotalLoad()/size;
+        }
       }
       if (node.getXceiverCount() > (2.0 * avgLoad)) {
         logNodeIsNotChosen(storage, "the node is too busy ");
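
The denominator change matters whenever the registered and in-service datanode counts diverge. As a hypothetical example: with 100 registered datanodes of which only 80 are in service and a cluster-wide load of 400 xceivers, the old average was 400/100 = 4.0, so the 2.0 * avgLoad cut-off rejected any node above 8 xceivers, while the corrected average of 400/80 = 5.0 keeps nodes with up to 10 xceivers eligible; dividing by decommissioned or dead nodes made the load check spuriously strict.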

+ 96 - 89
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/JspHelper.java

@@ -18,7 +18,26 @@
 
 package org.apache.hadoop.hdfs.server.common;
 
-import com.google.common.base.Charsets;
+import static org.apache.hadoop.fs.CommonConfigurationKeys.DEFAULT_HADOOP_HTTP_STATIC_USER;
+import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_HTTP_STATIC_USER;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.servlet.ServletContext;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.jsp.JspWriter;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -27,10 +46,17 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockReader;
 import org.apache.hadoop.hdfs.BlockReaderFactory;
+import org.apache.hadoop.hdfs.ClientContext;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.RemotePeerFactory;
+import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.net.TcpPeerServer;
-import org.apache.hadoop.hdfs.protocol.*;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
@@ -53,22 +79,7 @@ import org.apache.hadoop.security.authorize.ProxyUsers;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.VersionInfo;
 
-import javax.servlet.ServletContext;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.jsp.JspWriter;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-import java.net.URLEncoder;
-import java.util.*;
-
-import static org.apache.hadoop.fs.CommonConfigurationKeys.DEFAULT_HADOOP_HTTP_STATIC_USER;
-import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_HTTP_STATIC_USER;
+import com.google.common.base.Charsets;
 
 @InterfaceAudience.Private
 public class JspHelper {
@@ -168,101 +179,97 @@ public class JspHelper {
     }
     NodeRecord[] nodes = map.values().toArray(new NodeRecord[map.size()]);
     Arrays.sort(nodes, new NodeRecordComparator());
-    return bestNode(nodes, false, conf);
+    return bestNode(nodes, false);
   }
 
   public static DatanodeInfo bestNode(LocatedBlock blk, Configuration conf)
       throws IOException {
     DatanodeInfo[] nodes = blk.getLocations();
-    return bestNode(nodes, true, conf);
+    return bestNode(nodes, true);
   }
 
-  public static DatanodeInfo bestNode(DatanodeInfo[] nodes, boolean doRandom,
-      Configuration conf) throws IOException {
-    TreeSet<DatanodeInfo> deadNodes = new TreeSet<DatanodeInfo>();
-    DatanodeInfo chosenNode = null;
-    int failures = 0;
-    Socket s = null;
-    int index = -1;
+  private static DatanodeInfo bestNode(DatanodeInfo[] nodes, boolean doRandom)
+      throws IOException {
     if (nodes == null || nodes.length == 0) {
       throw new IOException("No nodes contain this block");
     }
-    while (s == null) {
-      if (chosenNode == null) {
-        do {
-          if (doRandom) {
-            index = DFSUtil.getRandom().nextInt(nodes.length);
-          } else {
-            index++;
-          }
-          chosenNode = nodes[index];
-        } while (deadNodes.contains(chosenNode));
-      }
-      chosenNode = nodes[index];
+    int l = 0;
+    while (l < nodes.length && !nodes[l].isDecommissioned()) {
+      ++l;
+    }
 
-      //just ping to check whether the node is alive
-      InetSocketAddress targetAddr = NetUtils.createSocketAddr(
-          chosenNode.getInfoAddr());
-        
-      try {
-        s = NetUtils.getDefaultSocketFactory(conf).createSocket();
-        s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
-        s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
-      } catch (IOException e) {
-        deadNodes.add(chosenNode);
-        IOUtils.closeSocket(s);
-        s = null;
-        failures++;
-      }
-      if (failures == nodes.length)
-        throw new IOException("Could not reach the block containing the data. Please try again");
-        
+    if (l == 0) {
+      throw new IOException("No active nodes contain this block");
     }
-    s.close();
-    return chosenNode;
+
+    int index = doRandom ? DFSUtil.getRandom().nextInt(l) : 0;
+    return nodes[index];
   }
 
   public static void streamBlockInAscii(InetSocketAddress addr, String poolId,
       long blockId, Token<BlockTokenIdentifier> blockToken, long genStamp,
       long blockSize, long offsetIntoBlock, long chunkSizeToView,
-      JspWriter out, Configuration conf, DFSClient.Conf dfsConf,
-      DataEncryptionKey encryptionKey)
+      JspWriter out, final Configuration conf, DFSClient.Conf dfsConf,
+      final DataEncryptionKey encryptionKey)
           throws IOException {
     if (chunkSizeToView == 0) return;
-    Socket s = NetUtils.getDefaultSocketFactory(conf).createSocket();
-    s.connect(addr, HdfsServerConstants.READ_TIMEOUT);
-    s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
-      
     int amtToRead = (int)Math.min(chunkSizeToView, blockSize - offsetIntoBlock);
       
-      // Use the block name for file name. 
-    String file = BlockReaderFactory.getFileName(addr, poolId, blockId);
-    BlockReader blockReader = BlockReaderFactory.newBlockReader(dfsConf, file,
-        new ExtendedBlock(poolId, blockId, 0, genStamp), blockToken,
-        offsetIntoBlock, amtToRead,  true,
-        "JspHelper", TcpPeerServer.peerFromSocketAndKey(s, encryptionKey),
-        new DatanodeID(addr.getAddress().getHostAddress(),
-            addr.getHostName(), poolId, addr.getPort(), 0, 0, 0), null,
-            null, null, false, CachingStrategy.newDefaultStrategy());
-        
+    BlockReader blockReader = new BlockReaderFactory(dfsConf).
+      setInetSocketAddress(addr).
+      setBlock(new ExtendedBlock(poolId, blockId, 0, genStamp)).
+      setFileName(BlockReaderFactory.getFileName(addr, poolId, blockId)).
+      setBlockToken(blockToken).
+      setStartOffset(offsetIntoBlock).
+      setLength(amtToRead).
+      setVerifyChecksum(true).
+      setClientName("JspHelper").
+      setClientCacheContext(ClientContext.getFromConf(conf)).
+      setDatanodeInfo(new DatanodeInfo(
+          new DatanodeID(addr.getAddress().getHostAddress(),
+              addr.getHostName(), poolId, addr.getPort(), 0, 0, 0))).
+      setCachingStrategy(CachingStrategy.newDefaultStrategy()).
+      setConfiguration(conf).
+      setRemotePeerFactory(new RemotePeerFactory() {
+        @Override
+        public Peer newConnectedPeer(InetSocketAddress addr)
+            throws IOException {
+          Peer peer = null;
+          Socket sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
+          try {
+            sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
+            sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
+            peer = TcpPeerServer.peerFromSocketAndKey(sock, encryptionKey);
+          } finally {
+            if (peer == null) {
+              IOUtils.closeSocket(sock);
+            }
+          }
+          return peer;
+        }
+      }).
+      build();
+
     final byte[] buf = new byte[amtToRead];
-    int readOffset = 0;
-    int retries = 2;
-    while ( amtToRead > 0 ) {
-      int numRead = amtToRead;
-      try {
-        blockReader.readFully(buf, readOffset, amtToRead);
-      }
-      catch (IOException e) {
-        retries--;
-        if (retries == 0)
-          throw new IOException("Could not read data from datanode");
-        continue;
+    try {
+      int readOffset = 0;
+      int retries = 2;
+      while (amtToRead > 0) {
+        int numRead = amtToRead;
+        try {
+          blockReader.readFully(buf, readOffset, amtToRead);
+        } catch (IOException e) {
+          retries--;
+          if (retries == 0)
+            throw new IOException("Could not read data from datanode");
+          continue;
+        }
+        amtToRead -= numRead;
+        readOffset += numRead;
       }
-      amtToRead -= numRead;
-      readOffset += numRead;
+    } finally {
+      blockReader.close();
     }
-    blockReader.close();
     out.print(HtmlQuoting.quoteHtmlChars(new String(buf, Charsets.UTF_8)));
   }
 

+ 6 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockMetadataHeader.java

@@ -34,6 +34,8 @@ import org.apache.hadoop.util.DataChecksum;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 
+import com.google.common.annotations.VisibleForTesting;
+
 
 
 /**
@@ -55,7 +57,8 @@ public class BlockMetadataHeader {
   private short version;
   private DataChecksum checksum = null;
     
-  BlockMetadataHeader(short version, DataChecksum checksum) {
+  @VisibleForTesting
+  public BlockMetadataHeader(short version, DataChecksum checksum) {
     this.checksum = checksum;
     this.version = version;
   }
@@ -148,7 +151,8 @@ public class BlockMetadataHeader {
    * @return 
    * @throws IOException
    */
-  private static void writeHeader(DataOutputStream out, 
+  @VisibleForTesting
+  public static void writeHeader(DataOutputStream out, 
                                   BlockMetadataHeader header) 
                                   throws IOException {
     out.writeShort(header.getVersion());
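
Making the constructor and writeHeader() public (behind @VisibleForTesting) lets the new short-circuit tests fabricate .meta files without a DataNode, since ShortCircuitReplica insists on a version-1 header when it preads the metadata. A minimal sketch of writing such a header, assuming only the members shown above plus DataChecksum from hadoop-common:

    import java.io.DataOutputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
    import org.apache.hadoop.util.DataChecksum;

    /** Sketch only: writing a minimal version-1 block metadata header in a test. */
    final class FakeMetaFileSketch {
      static void writeEmptyMetaFile(String path) throws IOException {
        DataChecksum checksum =
            DataChecksum.newDataChecksum(DataChecksum.Type.CRC32C, 512);
        DataOutputStream out = new DataOutputStream(new FileOutputStream(path));
        try {
          // Version 1 is the only version ShortCircuitReplica will accept.
          BlockMetadataHeader.writeHeader(out,
              new BlockMetadataHeader((short) 1, checksum));
          // Checksum bytes for the block data would follow here.
        } finally {
          out.close();
        }
      }
    }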

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java

@@ -2525,7 +2525,7 @@ public class DataNode extends Configured
   /**
    * Get current value of the max balancer bandwidth in bytes per second.
    *
-   * @return bandwidth Blanacer bandwidth in bytes per second for this datanode.
+   * @return Balancer bandwidth in bytes per second for this datanode.
    */
   public Long getBalancerBandwidth() {
     DataXceiverServer dxcs =

+ 32 - 72
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java

@@ -37,12 +37,12 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.builder.HashCodeBuilder;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.ChecksumException;
+import org.apache.hadoop.hdfs.ExtendedBlockId;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -56,43 +56,6 @@ import org.apache.hadoop.io.nativeio.NativeIO;
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
 public class FsDatasetCache {
-  /**
-   * Keys which identify MappableBlocks.
-   */
-  private static final class Key {
-    /**
-     * Block id.
-     */
-    final long id;
-
-    /**
-     * Block pool id.
-     */
-    final String bpid;
-
-    Key(long id, String bpid) {
-      this.id = id;
-      this.bpid = bpid;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-      if (o == null) {
-        return false;
-      }
-      if (!(o.getClass() == getClass())) {
-        return false;
-      }
-      Key other = (Key)o;
-      return ((other.id == this.id) && (other.bpid.equals(this.bpid)));
-    }
-
-    @Override
-    public int hashCode() {
-      return new HashCodeBuilder().append(id).append(bpid).hashCode();
-    }
-  };
-
   /**
    * MappableBlocks that we know about.
    */
@@ -143,7 +106,8 @@ public class FsDatasetCache {
   /**
    * Stores MappableBlock objects and the states they're in.
    */
-  private final HashMap<Key, Value> mappableBlockMap = new HashMap<Key, Value>();
+  private final HashMap<ExtendedBlockId, Value> mappableBlockMap =
+      new HashMap<ExtendedBlockId, Value>();
 
   private final AtomicLong numBlocksCached = new AtomicLong(0);
 
@@ -260,12 +224,12 @@ public class FsDatasetCache {
    */
   synchronized List<Long> getCachedBlocks(String bpid) {
     List<Long> blocks = new ArrayList<Long>();
-    for (Iterator<Entry<Key, Value>> iter =
+    for (Iterator<Entry<ExtendedBlockId, Value>> iter =
         mappableBlockMap.entrySet().iterator(); iter.hasNext(); ) {
-      Entry<Key, Value> entry = iter.next();
-      if (entry.getKey().bpid.equals(bpid)) {
+      Entry<ExtendedBlockId, Value> entry = iter.next();
+      if (entry.getKey().getBlockPoolId().equals(bpid)) {
         if (entry.getValue().state.shouldAdvertise()) {
-          blocks.add(entry.getKey().id);
+          blocks.add(entry.getKey().getBlockId());
         }
       }
     }
@@ -278,7 +242,7 @@ public class FsDatasetCache {
   synchronized void cacheBlock(long blockId, String bpid,
       String blockFileName, long length, long genstamp,
       Executor volumeExecutor) {
-    Key key = new Key(blockId, bpid);
+    ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
     Value prevValue = mappableBlockMap.get(key);
     if (prevValue != null) {
       if (LOG.isDebugEnabled()) {
@@ -299,7 +263,7 @@ public class FsDatasetCache {
   }
 
   synchronized void uncacheBlock(String bpid, long blockId) {
-    Key key = new Key(blockId, bpid);
+    ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
     Value prevValue = mappableBlockMap.get(key);
 
     if (prevValue == null) {
@@ -344,12 +308,12 @@ public class FsDatasetCache {
    * Background worker that mmaps, mlocks, and checksums a block
    */
   private class CachingTask implements Runnable {
-    private final Key key; 
+    private final ExtendedBlockId key; 
     private final String blockFileName;
     private final long length;
     private final long genstamp;
 
-    CachingTask(Key key, String blockFileName, long length, long genstamp) {
+    CachingTask(ExtendedBlockId key, String blockFileName, long length, long genstamp) {
       this.key = key;
       this.blockFileName = blockFileName;
       this.length = length;
@@ -361,13 +325,13 @@ public class FsDatasetCache {
       boolean success = false;
       FileInputStream blockIn = null, metaIn = null;
       MappableBlock mappableBlock = null;
-      ExtendedBlock extBlk =
-          new ExtendedBlock(key.bpid, key.id, length, genstamp);
+      ExtendedBlock extBlk = new ExtendedBlock(key.getBlockPoolId(),
+          key.getBlockId(), length, genstamp);
       long newUsedBytes = usedBytesCount.reserve(length);
       if (newUsedBytes < 0) {
-        LOG.warn("Failed to cache block id " + key.id + ", pool " + key.bpid +
-            ": could not reserve " + length + " more bytes in the " +
-            "cache: " + DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
+        LOG.warn("Failed to cache " + key + ": could not reserve " + length +
+            " more bytes in the cache: " +
+            DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
             " of " + maxBytes + " exceeded.");
         numBlocksFailedToCache.incrementAndGet();
         return;
@@ -378,16 +342,15 @@ public class FsDatasetCache {
           metaIn = (FileInputStream)dataset.getMetaDataInputStream(extBlk)
               .getWrappedStream();
         } catch (ClassCastException e) {
-          LOG.warn("Failed to cache block with id " + key.id + ", pool " +
-                key.bpid + ": Underlying blocks are not backed by files.", e);
+          LOG.warn("Failed to cache " + key +
+              ": Underlying blocks are not backed by files.", e);
           return;
         } catch (FileNotFoundException e) {
-          LOG.info("Failed to cache block with id " + key.id + ", pool " +
-                key.bpid + ": failed to find backing files.");
+          LOG.info("Failed to cache " + key + ": failed to find backing " +
+              "files.");
           return;
         } catch (IOException e) {
-          LOG.warn("Failed to cache block with id " + key.id + ", pool " +
-                key.bpid + ": failed to open file", e);
+          LOG.warn("Failed to cache " + key + ": failed to open file", e);
           return;
         }
         try {
@@ -395,11 +358,10 @@ public class FsDatasetCache {
               load(length, blockIn, metaIn, blockFileName);
         } catch (ChecksumException e) {
           // Exception message is bogus since this wasn't caused by a file read
-          LOG.warn("Failed to cache block " + key.id + " in " + key.bpid + ": " +
-                   "checksum verification failed.");
+          LOG.warn("Failed to cache " + key + ": checksum verification failed.");
           return;
         } catch (IOException e) {
-          LOG.warn("Failed to cache block " + key.id + " in " + key.bpid, e);
+          LOG.warn("Failed to cache " + key, e);
           return;
         }
         synchronized (FsDatasetCache.this) {
@@ -409,15 +371,14 @@ public class FsDatasetCache {
                                    value.state == State.CACHING_CANCELLED);
           if (value.state == State.CACHING_CANCELLED) {
             mappableBlockMap.remove(key);
-            LOG.warn("Caching of block " + key.id + " in " + key.bpid +
-                " was cancelled.");
+            LOG.warn("Caching of " + key + " was cancelled.");
             return;
           }
           mappableBlockMap.put(key, new Value(mappableBlock, State.CACHED));
         }
         if (LOG.isDebugEnabled()) {
-          LOG.debug("Successfully cached block " + key.id + " in " + key.bpid +
-              ".  We are now caching " + newUsedBytes + " bytes in total.");
+          LOG.debug("Successfully cached " + key + ".  We are now caching " +
+              newUsedBytes + " bytes in total.");
         }
         numBlocksCached.addAndGet(1);
         success = true;
@@ -425,9 +386,8 @@ public class FsDatasetCache {
         if (!success) {
           newUsedBytes = usedBytesCount.release(length);
           if (LOG.isDebugEnabled()) {
-            LOG.debug("Caching of block " + key.id + " in " +
-              key.bpid + " was aborted.  We are now caching only " +
-              newUsedBytes + " + bytes in total.");
+            LOG.debug("Caching of " + key + " was aborted.  We are now " +
+                "caching only " + newUsedBytes + " + bytes in total.");
           }
           IOUtils.closeQuietly(blockIn);
           IOUtils.closeQuietly(metaIn);
@@ -445,9 +405,9 @@ public class FsDatasetCache {
   }
 
   private class UncachingTask implements Runnable {
-    private final Key key; 
+    private final ExtendedBlockId key; 
 
-    UncachingTask(Key key) {
+    UncachingTask(ExtendedBlockId key) {
       this.key = key;
     }
 
@@ -470,8 +430,8 @@ public class FsDatasetCache {
           usedBytesCount.release(value.mappableBlock.getLength());
       numBlocksCached.addAndGet(-1);
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Uncaching of block " + key.id + " in " + key.bpid +
-            " completed.  usedBytes = " + newUsedBytes);
+        LOG.debug("Uncaching of " + key + " completed.  " +
+            "usedBytes = " + newUsedBytes);
       }
     }
   }
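
The change above keys the cache map on ExtendedBlockId instead of the removed private Key class, which relies on ExtendedBlockId having value equality over (block id, block pool id). A minimal sketch of that property, using only the constructor and getters visible in this diff; the block id and pool id values are made up:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hadoop.hdfs.ExtendedBlockId;

    public class ExtendedBlockIdKeySketch {
      public static void main(String[] args) {
        Map<ExtendedBlockId, String> state = new HashMap<ExtendedBlockId, String>();

        // Two independently constructed keys for the same (blockId, bpid) pair...
        ExtendedBlockId k1 = new ExtendedBlockId(1073741825L, "BP-1-10.0.0.1-1");
        ExtendedBlockId k2 = new ExtendedBlockId(1073741825L, "BP-1-10.0.0.1-1");

        state.put(k1, "CACHING");
        // ...must hit the same entry, which is what the map refactoring relies on.
        System.out.println(state.get(k2));          // CACHING
        System.out.println(k2.getBlockId());        // 1073741825
        System.out.println(k2.getBlockPoolId());    // BP-1-10.0.0.1-1
      }
    }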

+ 129 - 18
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java

@@ -50,8 +50,10 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
 import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.InvalidRequestException;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.protocol.CacheDirective;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
@@ -62,11 +64,15 @@ import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
 import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveInfoProto;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolInfoProto;
+import org.apache.hadoop.hdfs.protocolPB.PBHelper;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
 import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.CacheManagerSection;
 import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
 import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
@@ -81,6 +87,7 @@ import org.apache.hadoop.util.LightWeightGSet;
 import org.apache.hadoop.util.Time;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Lists;
 
 /**
  * The Cache Manager handles caching on DataNodes.
@@ -167,6 +174,19 @@ public final class CacheManager {
    */
   private CacheReplicationMonitor monitor;
 
+  public static final class PersistState {
+    public final CacheManagerSection section;
+    public final List<CachePoolInfoProto> pools;
+    public final List<CacheDirectiveInfoProto> directives;
+
+    public PersistState(CacheManagerSection section,
+        List<CachePoolInfoProto> pools, List<CacheDirectiveInfoProto> directives) {
+      this.section = section;
+      this.pools = pools;
+      this.directives = directives;
+    }
+  }
+
   CacheManager(FSNamesystem namesystem, Configuration conf,
       BlockManager blockManager) {
     this.namesystem = namesystem;
@@ -944,6 +964,64 @@ public final class CacheManager {
     serializerCompat.save(out, sdPath);
   }
 
+  public PersistState saveState() throws IOException {
+    ArrayList<CachePoolInfoProto> pools = Lists
+        .newArrayListWithCapacity(cachePools.size());
+    ArrayList<CacheDirectiveInfoProto> directives = Lists
+        .newArrayListWithCapacity(directivesById.size());
+
+    for (CachePool pool : cachePools.values()) {
+      CachePoolInfo p = pool.getInfo(true);
+      CachePoolInfoProto.Builder b = CachePoolInfoProto.newBuilder()
+          .setPoolName(p.getPoolName());
+
+      if (p.getOwnerName() != null)
+        b.setOwnerName(p.getOwnerName());
+
+      if (p.getGroupName() != null)
+        b.setGroupName(p.getGroupName());
+
+      if (p.getMode() != null)
+        b.setMode(p.getMode().toShort());
+
+      if (p.getLimit() != null)
+        b.setLimit(p.getLimit());
+
+      pools.add(b.build());
+    }
+
+    for (CacheDirective directive : directivesById.values()) {
+      CacheDirectiveInfo info = directive.toInfo();
+      CacheDirectiveInfoProto.Builder b = CacheDirectiveInfoProto.newBuilder()
+          .setId(info.getId());
+
+      if (info.getPath() != null) {
+        b.setPath(info.getPath().toUri().getPath());
+      }
+
+      if (info.getReplication() != null) {
+        b.setReplication(info.getReplication());
+      }
+
+      if (info.getPool() != null) {
+        b.setPool(info.getPool());
+      }
+
+      Expiration expiry = info.getExpiration();
+      if (expiry != null) {
+        assert (!expiry.isRelative());
+        b.setExpiration(PBHelper.convert(expiry));
+      }
+
+      directives.add(b.build());
+    }
+    CacheManagerSection s = CacheManagerSection.newBuilder()
+        .setNextDirectiveId(nextDirectiveId).setNumPools(pools.size())
+        .setNumDirectives(directives.size()).build();
+
+    return new PersistState(s, pools, directives);
+  }
+
   /**
    * Reloads CacheManager state from the passed DataInput. Used during namenode
    * startup to restore CacheManager state from an FSImage.
@@ -954,6 +1032,56 @@ public final class CacheManager {
     serializerCompat.load(in);
   }
 
+  public void loadState(PersistState s) throws IOException {
+    nextDirectiveId = s.section.getNextDirectiveId();
+    for (CachePoolInfoProto p : s.pools) {
+      CachePoolInfo info = new CachePoolInfo(p.getPoolName());
+      if (p.hasOwnerName())
+        info.setOwnerName(p.getOwnerName());
+
+      if (p.hasGroupName())
+        info.setGroupName(p.getGroupName());
+
+      if (p.hasMode())
+        info.setMode(new FsPermission((short) p.getMode()));
+
+      if (p.hasLimit())
+        info.setLimit(p.getLimit());
+
+      addCachePool(info);
+    }
+
+    for (CacheDirectiveInfoProto p : s.directives) {
+      // Get pool reference by looking it up in the map
+      final String poolName = p.getPool();
+      CacheDirective directive = new CacheDirective(p.getId(), new Path(
+          p.getPath()).toUri().getPath(), (short) p.getReplication(), p
+          .getExpiration().getMillis());
+      addCacheDirective(poolName, directive);
+    }
+  }
+
+  private void addCacheDirective(final String poolName,
+      final CacheDirective directive) throws IOException {
+    CachePool pool = cachePools.get(poolName);
+    if (pool == null) {
+      throw new IOException("Directive refers to pool " + poolName
+          + ", which does not exist.");
+    }
+    boolean addedDirective = pool.getDirectiveList().add(directive);
+    assert addedDirective;
+    if (directivesById.put(directive.getId(), directive) != null) {
+      throw new IOException("A directive with ID " + directive.getId()
+          + " already exists");
+    }
+    List<CacheDirective> directives = directivesByPath.get(directive.getPath());
+    if (directives == null) {
+      directives = new LinkedList<CacheDirective>();
+      directivesByPath.put(directive.getPath(), directives);
+    }
+    directives.add(directive);
+  }
+
   private final class SerializerCompat {
     private void save(DataOutputStream out, String sdPath) throws IOException {
       out.writeLong(nextDirectiveId);
@@ -1036,27 +1164,10 @@ public final class CacheManager {
         CacheDirectiveInfo info = FSImageSerialization.readCacheDirectiveInfo(in);
         // Get pool reference by looking it up in the map
         final String poolName = info.getPool();
-        CachePool pool = cachePools.get(poolName);
-        if (pool == null) {
-          throw new IOException("Directive refers to pool " + poolName +
-              ", which does not exist.");
-        }
         CacheDirective directive =
             new CacheDirective(info.getId(), info.getPath().toUri().getPath(),
                 info.getReplication(), info.getExpiration().getAbsoluteMillis());
-        boolean addedDirective = pool.getDirectiveList().add(directive);
-        assert addedDirective;
-        if (directivesById.put(directive.getId(), directive) != null) {
-          throw new IOException("A directive with ID " + directive.getId() +
-              " already exists");
-        }
-        List<CacheDirective> directives =
-            directivesByPath.get(directive.getPath());
-        if (directives == null) {
-          directives = new LinkedList<CacheDirective>();
-          directivesByPath.put(directive.getPath(), directives);
-        }
-        directives.add(directive);
+        addCacheDirective(poolName, directive);
         counter.increment();
       }
       prog.endStep(Phase.LOADING_FSIMAGE, step);
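
saveState() only sets the optional protobuf fields whose Java values are non-null, and loadState() mirrors that with has*() guards. A minimal sketch of the resulting round-trip contract on CachePoolInfoProto, using only builder methods that appear in this diff:

    import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolInfoProto;

    public class CachePoolProtoSketch {
      public static void main(String[] args) {
        // Only the fields that were actually set on save travel through the image.
        CachePoolInfoProto p = CachePoolInfoProto.newBuilder()
            .setPoolName("pool1")          // always present
            .build();                      // owner/group/mode/limit left unset

        // The load path mirrors the save path with has*() guards.
        System.out.println(p.getPoolName());    // pool1
        System.out.println(p.hasOwnerName());   // false, so the loader skips setOwnerName
      }
    }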

+ 2 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java

@@ -587,6 +587,8 @@ class ClusterJspHelper {
         toXmlItemBlockWithLink(doc, nn.host, nn.httpAddress, "NameNode");
         toXmlItemBlock(doc, "Blockpool Used",
             StringUtils.byteDesc(nn.bpUsed));
+        toXmlItemBlock(doc, "Blockpool Used%",
+            DFSUtil.percent2String(DFSUtil.getPercentUsed(nn.bpUsed, total)));
         toXmlItemBlock(doc, "Files And Directories",
             Long.toString(nn.filesAndDirectories));
         toXmlItemBlock(doc, "Blocks", Long.toString(nn.blocksCount));

+ 6 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSClusterStats.java

@@ -42,6 +42,12 @@ public interface FSClusterStats {
    *         for writing targets, and false otherwise.
    */
   public boolean isAvoidingStaleDataNodesForWrite();
+
+  /**
+   * Indicates number of datanodes that are in service.
+   * @return Number of datanodes that are both alive and not decommissioned.
+   */
+  public int getNumDatanodesInService();
 }
     
     

+ 2 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java

@@ -813,8 +813,7 @@ public class FSImage implements Closeable {
    */
   private void loadFSImage(File curFile, MD5Hash expectedMd5,
       FSNamesystem target, MetaRecoveryContext recovery) throws IOException {
-    FSImageFormat.Loader loader = new FSImageFormat.Loader(
-        conf, target);
+    FSImageFormat.LoaderDelegator loader = FSImageFormat.newLoader(conf, target);
     loader.load(curFile);
     target.setBlockPoolId(this.getBlockPoolID());
 
@@ -843,7 +842,7 @@ public class FSImage implements Closeable {
     File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
     File dstFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);
     
-    FSImageFormat.Saver saver = new FSImageFormat.Saver(context);
+    FSImageFormatProtobuf.Saver saver = new FSImageFormatProtobuf.Saver(context);
     FSImageCompression compression = FSImageCompression.createCompression(conf);
     saver.save(newFile, compression);
     

+ 5 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageCompression.java

@@ -57,6 +57,10 @@ class FSImageCompression {
     imageCodec = codec;
   }
 
+  public CompressionCodec getImageCodec() {
+    return imageCodec;
+  }
+
   /**
    * Create a "noop" compression - i.e. uncompressed
    */
@@ -89,7 +93,7 @@ class FSImageCompression {
    * Create a compression instance using the codec specified by
    * <code>codecClassName</code>
    */
-  private static FSImageCompression createCompression(Configuration conf,
+  static FSImageCompression createCompression(Configuration conf,
                                                       String codecClassName)
     throws IOException {
 

+ 68 - 7
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java

@@ -68,12 +68,13 @@ import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Co
 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
 import org.apache.hadoop.hdfs.util.ReadOnlyList;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.MD5Hash;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.util.StringUtils;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.annotations.VisibleForTesting;
 
 /**
  * Contains inner classes for reading or writing the on-disk format for
@@ -180,16 +181,74 @@ import com.google.common.base.Preconditions;
 @InterfaceStability.Evolving
 public class FSImageFormat {
   private static final Log LOG = FSImage.LOG;
-  
+
   // Static-only class
   private FSImageFormat() {}
-  
+
+  interface AbstractLoader {
+    MD5Hash getLoadedImageMd5();
+    long getLoadedImageTxId();
+  }
+
+  static class LoaderDelegator implements AbstractLoader {
+    private AbstractLoader impl;
+    private final Configuration conf;
+    private final FSNamesystem fsn;
+
+    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
+      this.conf = conf;
+      this.fsn = fsn;
+    }
+
+    @Override
+    public MD5Hash getLoadedImageMd5() {
+      return impl.getLoadedImageMd5();
+    }
+
+    @Override
+    public long getLoadedImageTxId() {
+      return impl.getLoadedImageTxId();
+    }
+
+    public void load(File file) throws IOException {
+      Preconditions.checkState(impl == null, "Image already loaded!");
+
+      FileInputStream is = null;
+      try {
+        is = new FileInputStream(file);
+        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
+        IOUtils.readFully(is, magic, 0, magic.length);
+        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
+          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
+              conf, fsn);
+          impl = loader;
+          loader.load(file);
+        } else {
+          Loader loader = new Loader(conf, fsn);
+          impl = loader;
+          loader.load(file);
+        }
+
+      } finally {
+        IOUtils.cleanup(LOG, is);
+      }
+    }
+  }
+
+  /**
+   * Construct a loader class to load the image. It chooses the loader based on
+   * the layout version.
+   */
+  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
+    return new LoaderDelegator(conf, fsn);
+  }
+
   /**
    * A one-shot class responsible for loading an image. The load() function
    * should be called once, after which the getter methods may be used to retrieve
    * information about the image that was loaded, if loading was successful.
    */
-  public static class Loader {
+  public static class Loader implements AbstractLoader {
     private final Configuration conf;
     /** which namesystem this loader is working for */
     private final FSNamesystem namesystem;
@@ -214,12 +273,14 @@ public class FSImageFormat {
      * Return the MD5 checksum of the image that has been loaded.
      * @throws IllegalStateException if load() has not yet been called.
      */
-    MD5Hash getLoadedImageMd5() {
+    @Override
+    public MD5Hash getLoadedImageMd5() {
       checkLoaded();
       return imgDigest;
     }
 
-    long getLoadedImageTxId() {
+    @Override
+    public long getLoadedImageTxId() {
       checkLoaded();
       return imgTxId;
     }
@@ -242,7 +303,7 @@ public class FSImageFormat {
       }
     }
 
-    void load(File curFile) throws IOException {
+    public void load(File curFile) throws IOException {
       checkNotLoaded();
       assert curFile != null : "curFile is null";
 

+ 426 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatPBINode.java

@@ -0,0 +1,426 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
+import org.apache.hadoop.hdfs.protocolPB.PBHelper;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SaverContext;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FilesUnderConstructionSection.FileUnderConstructionEntry;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeDirectorySection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
+import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
+import org.apache.hadoop.hdfs.util.ReadOnlyList;
+
+import com.google.common.base.Preconditions;
+import com.google.protobuf.ByteString;
+
+@InterfaceAudience.Private
+public final class FSImageFormatPBINode {
+  private final static long USER_GROUP_STRID_MASK = (1 << 24) - 1;
+  private final static int USER_STRID_OFFSET = 40;
+  private final static int GROUP_STRID_OFFSET = 16;
+  private static final Log LOG = LogFactory.getLog(FSImageFormatProtobuf.class);
+
+  public final static class Loader {
+    public static PermissionStatus loadPermission(long id,
+        final String[] stringTable) {
+      short perm = (short) (id & ((1 << GROUP_STRID_OFFSET) - 1));
+      int gsid = (int) ((id >> GROUP_STRID_OFFSET) & USER_GROUP_STRID_MASK);
+      int usid = (int) ((id >> USER_STRID_OFFSET) & USER_GROUP_STRID_MASK);
+      return new PermissionStatus(stringTable[usid], stringTable[gsid],
+          new FsPermission(perm));
+    }
+
+    public static INodeDirectory loadINodeDirectory(INodeSection.INode n,
+        final String[] stringTable) {
+      assert n.getType() == INodeSection.INode.Type.DIRECTORY;
+      INodeSection.INodeDirectory d = n.getDirectory();
+
+      final PermissionStatus permissions = loadPermission(d.getPermission(),
+          stringTable);
+      final INodeDirectory dir = new INodeDirectory(n.getId(), n.getName()
+          .toByteArray(), permissions, d.getModificationTime());
+
+      final long nsQuota = d.getNsQuota(), dsQuota = d.getDsQuota();
+      if (nsQuota >= 0 || dsQuota >= 0) {
+        dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
+      }
+      return dir;
+    }
+
+    public static void updateBlocksMap(INodeFile file, BlockManager bm) {
+      // Add file->block mapping
+      final BlockInfo[] blocks = file.getBlocks();
+      if (blocks != null) {
+        for (int i = 0; i < blocks.length; i++) {
+          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
+        }
+      }
+    }
+
+    private final FSDirectory dir;
+    private final FSNamesystem fsn;
+    private final FSImageFormatProtobuf.Loader parent;
+
+    Loader(FSNamesystem fsn, final FSImageFormatProtobuf.Loader parent) {
+      this.fsn = fsn;
+      this.dir = fsn.dir;
+      this.parent = parent;
+    }
+
+    void loadINodeDirectorySection(InputStream in) throws IOException {
+      final List<INodeReference> refList = parent.getLoaderContext()
+          .getRefList();
+      while (true) {
+        INodeDirectorySection.DirEntry e = INodeDirectorySection.DirEntry
+            .parseDelimitedFrom(in);
+        // note that in is a LimitedInputStream
+        if (e == null) {
+          break;
+        }
+        INodeDirectory p = dir.getInode(e.getParent()).asDirectory();
+        for (long id : e.getChildrenList()) {
+          INode child = dir.getInode(id);
+          addToParent(p, child);
+        }
+        for (int refId : e.getRefChildrenList()) {
+          INodeReference ref = refList.get(refId);
+          addToParent(p, ref);
+        }
+      }
+    }
+
+    void loadINodeSection(InputStream in) throws IOException {
+      INodeSection s = INodeSection.parseDelimitedFrom(in);
+      fsn.resetLastInodeId(s.getLastInodeId());
+      LOG.info("Loading " + s.getNumInodes() + " INodes.");
+      for (int i = 0; i < s.getNumInodes(); ++i) {
+        INodeSection.INode p = INodeSection.INode.parseDelimitedFrom(in);
+        if (p.getId() == INodeId.ROOT_INODE_ID) {
+          loadRootINode(p);
+        } else {
+          INode n = loadINode(p);
+          dir.addToInodeMap(n);
+        }
+      }
+    }
+
+    /**
+     * Load the under-construction files section, and update the lease map
+     */
+    void loadFilesUnderConstructionSection(InputStream in) throws IOException {
+      while (true) {
+        FileUnderConstructionEntry entry = FileUnderConstructionEntry
+            .parseDelimitedFrom(in);
+        if (entry == null) {
+          break;
+        }
+        // update the lease manager
+        INodeFile file = dir.getInode(entry.getInodeId()).asFile();
+        FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature();
+        Preconditions.checkState(uc != null); // file must be under-construction
+        fsn.leaseManager.addLease(uc.getClientName(), entry.getFullPath());
+      }
+    }
+
+    private void addToParent(INodeDirectory parent, INode child) {
+      if (parent == dir.rootDir && FSDirectory.isReservedName(child)) {
+        throw new HadoopIllegalArgumentException("File name \""
+            + child.getLocalName() + "\" is reserved. Please "
+            + " change the name of the existing file or directory to another "
+            + "name before upgrading to this release.");
+      }
+      // NOTE: This does not update space counts for parents
+      if (!parent.addChild(child)) {
+        return;
+      }
+      dir.cacheName(child);
+
+      if (child.isFile()) {
+        updateBlocksMap(child.asFile(), fsn.getBlockManager());
+      }
+    }
+
+    private INode loadINode(INodeSection.INode n) {
+      switch (n.getType()) {
+      case FILE:
+        return loadINodeFile(n);
+      case DIRECTORY:
+        return loadINodeDirectory(n, parent.getLoaderContext().getStringTable());
+      case SYMLINK:
+        return loadINodeSymlink(n);
+      default:
+        break;
+      }
+      return null;
+    }
+
+    private INodeFile loadINodeFile(INodeSection.INode n) {
+      assert n.getType() == INodeSection.INode.Type.FILE;
+      INodeSection.INodeFile f = n.getFile();
+      List<BlockProto> bp = f.getBlocksList();
+      short replication = (short) f.getReplication();
+
+      BlockInfo[] blocks = new BlockInfo[bp.size()];
+      for (int i = 0, e = bp.size(); i < e; ++i) {
+        blocks[i] = new BlockInfo(PBHelper.convert(bp.get(i)), replication);
+      }
+      final PermissionStatus permissions = loadPermission(f.getPermission(),
+          parent.getLoaderContext().getStringTable());
+
+      final INodeFile file = new INodeFile(n.getId(),
+          n.getName().toByteArray(), permissions, f.getModificationTime(),
+          f.getAccessTime(), blocks, replication, f.getPreferredBlockSize());
+      // under-construction information
+      if (f.hasFileUC()) {
+        INodeSection.FileUnderConstructionFeature uc = f.getFileUC();
+        file.toUnderConstruction(uc.getClientName(), uc.getClientMachine(),
+            null);
+        if (blocks.length > 0) {
+          BlockInfo lastBlk = file.getLastBlock();
+          // replace the last block of file
+          file.setBlock(file.numBlocks() - 1, new BlockInfoUnderConstruction(
+              lastBlk, replication));
+        }
+      }
+      return file;
+    }
+
+
+    private INodeSymlink loadINodeSymlink(INodeSection.INode n) {
+      assert n.getType() == INodeSection.INode.Type.SYMLINK;
+      INodeSection.INodeSymlink s = n.getSymlink();
+      final PermissionStatus permissions = loadPermission(s.getPermission(),
+          parent.getLoaderContext().getStringTable());
+      return new INodeSymlink(n.getId(), n.getName().toByteArray(), permissions,
+          0, 0, s.getTarget().toStringUtf8());
+    }
+
+    private void loadRootINode(INodeSection.INode p) {
+      INodeDirectory root = loadINodeDirectory(p, parent.getLoaderContext()
+          .getStringTable());
+      final Quota.Counts q = root.getQuotaCounts();
+      final long nsQuota = q.get(Quota.NAMESPACE);
+      final long dsQuota = q.get(Quota.DISKSPACE);
+      if (nsQuota != -1 || dsQuota != -1) {
+        dir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
+      }
+      dir.rootDir.cloneModificationTime(root);
+      dir.rootDir.clonePermissionStatus(root);
+    }
+  }
+
+  public final static class Saver {
+    private static long buildPermissionStatus(INodeAttributes n,
+        final SaverContext.DeduplicationMap<String> stringMap) {
+      long userId = stringMap.getId(n.getUserName());
+      long groupId = stringMap.getId(n.getGroupName());
+      return ((userId & USER_GROUP_STRID_MASK) << USER_STRID_OFFSET)
+          | ((groupId & USER_GROUP_STRID_MASK) << GROUP_STRID_OFFSET)
+          | n.getFsPermissionShort();
+    }
+
+    public static INodeSection.INodeFile.Builder buildINodeFile(
+        INodeFileAttributes file,
+        final SaverContext.DeduplicationMap<String> stringMap) {
+      INodeSection.INodeFile.Builder b = INodeSection.INodeFile.newBuilder()
+          .setAccessTime(file.getAccessTime())
+          .setModificationTime(file.getModificationTime())
+          .setPermission(buildPermissionStatus(file, stringMap))
+          .setPreferredBlockSize(file.getPreferredBlockSize())
+          .setReplication(file.getFileReplication());
+      return b;
+    }
+
+    public static INodeSection.INodeDirectory.Builder buildINodeDirectory(
+        INodeDirectoryAttributes dir,
+        final SaverContext.DeduplicationMap<String> stringMap) {
+      Quota.Counts quota = dir.getQuotaCounts();
+      INodeSection.INodeDirectory.Builder b = INodeSection.INodeDirectory
+          .newBuilder().setModificationTime(dir.getModificationTime())
+          .setNsQuota(quota.get(Quota.NAMESPACE))
+          .setDsQuota(quota.get(Quota.DISKSPACE))
+          .setPermission(buildPermissionStatus(dir, stringMap));
+      return b;
+    }
+
+    private final FSNamesystem fsn;
+    private final FileSummary.Builder summary;
+    private final SaveNamespaceContext context;
+    private final FSImageFormatProtobuf.Saver parent;
+
+    Saver(FSImageFormatProtobuf.Saver parent, FileSummary.Builder summary) {
+      this.parent = parent;
+      this.summary = summary;
+      this.context = parent.getContext();
+      this.fsn = context.getSourceNamesystem();
+    }
+
+    void serializeINodeDirectorySection(OutputStream out) throws IOException {
+      Iterator<INodeWithAdditionalFields> iter = fsn.getFSDirectory()
+          .getINodeMap().getMapIterator();
+      final ArrayList<INodeReference> refList = parent.getSaverContext()
+          .getRefList();
+      int i = 0;
+      while (iter.hasNext()) {
+        INodeWithAdditionalFields n = iter.next();
+        if (!n.isDirectory()) {
+          continue;
+        }
+
+        ReadOnlyList<INode> children = n.asDirectory().getChildrenList(
+            Snapshot.CURRENT_STATE_ID);
+        if (children.size() > 0) {
+          INodeDirectorySection.DirEntry.Builder b = INodeDirectorySection.
+              DirEntry.newBuilder().setParent(n.getId());
+          for (INode inode : children) {
+            if (!inode.isReference()) {
+              b.addChildren(inode.getId());
+            } else {
+              refList.add(inode.asReference());
+              b.addRefChildren(refList.size() - 1);
+            }
+          }
+          INodeDirectorySection.DirEntry e = b.build();
+          e.writeDelimitedTo(out);
+        }
+
+        ++i;
+        if (i % FSImageFormatProtobuf.Saver.CHECK_CANCEL_INTERVAL == 0) {
+          context.checkCancelled();
+        }
+      }
+      parent.commitSection(summary,
+          FSImageFormatProtobuf.SectionName.INODE_DIR);
+    }
+
+    void serializeINodeSection(OutputStream out) throws IOException {
+      INodeMap inodesMap = fsn.dir.getINodeMap();
+
+      INodeSection.Builder b = INodeSection.newBuilder()
+          .setLastInodeId(fsn.getLastInodeId()).setNumInodes(inodesMap.size());
+      INodeSection s = b.build();
+      s.writeDelimitedTo(out);
+
+      int i = 0;
+      Iterator<INodeWithAdditionalFields> iter = inodesMap.getMapIterator();
+      while (iter.hasNext()) {
+        INodeWithAdditionalFields n = iter.next();
+        save(out, n);
+        ++i;
+        if (i % FSImageFormatProtobuf.Saver.CHECK_CANCEL_INTERVAL == 0) {
+          context.checkCancelled();
+        }
+      }
+      parent.commitSection(summary, FSImageFormatProtobuf.SectionName.INODE);
+    }
+
+    void serializeFilesUCSection(OutputStream out) throws IOException {
+      Map<String, INodeFile> ucMap = fsn.getFilesUnderConstruction();
+      for (Map.Entry<String, INodeFile> entry : ucMap.entrySet()) {
+        String path = entry.getKey();
+        INodeFile file = entry.getValue();
+        FileUnderConstructionEntry.Builder b = FileUnderConstructionEntry
+            .newBuilder().setInodeId(file.getId()).setFullPath(path);
+        FileUnderConstructionEntry e = b.build();
+        e.writeDelimitedTo(out);
+      }
+      parent.commitSection(summary,
+          FSImageFormatProtobuf.SectionName.FILES_UNDERCONSTRUCTION);
+    }
+
+    private void save(OutputStream out, INode n) throws IOException {
+      if (n.isDirectory()) {
+        save(out, n.asDirectory());
+      } else if (n.isFile()) {
+        save(out, n.asFile());
+      } else if (n.isSymlink()) {
+        save(out, n.asSymlink());
+      }
+    }
+
+    private void save(OutputStream out, INodeDirectory n) throws IOException {
+      INodeSection.INodeDirectory.Builder b = buildINodeDirectory(n,
+          parent.getSaverContext().getStringMap());
+      INodeSection.INode r = buildINodeCommon(n)
+          .setType(INodeSection.INode.Type.DIRECTORY).setDirectory(b).build();
+      r.writeDelimitedTo(out);
+    }
+
+    private void save(OutputStream out, INodeFile n) throws IOException {
+      INodeSection.INodeFile.Builder b = buildINodeFile(n,
+          parent.getSaverContext().getStringMap());
+
+      for (Block block : n.getBlocks()) {
+        b.addBlocks(PBHelper.convert(block));
+      }
+
+      FileUnderConstructionFeature uc = n.getFileUnderConstructionFeature();
+      if (uc != null) {
+        INodeSection.FileUnderConstructionFeature f =
+            INodeSection.FileUnderConstructionFeature
+            .newBuilder().setClientName(uc.getClientName())
+            .setClientMachine(uc.getClientMachine()).build();
+        b.setFileUC(f);
+      }
+
+      INodeSection.INode r = buildINodeCommon(n)
+          .setType(INodeSection.INode.Type.FILE).setFile(b).build();
+      r.writeDelimitedTo(out);
+    }
+
+    private void save(OutputStream out, INodeSymlink n) throws IOException {
+      INodeSection.INodeSymlink.Builder b = INodeSection.INodeSymlink
+          .newBuilder()
+          .setPermission(buildPermissionStatus(n, parent.getSaverContext().getStringMap()))
+          .setTarget(ByteString.copyFrom(n.getSymlink()));
+      INodeSection.INode r = buildINodeCommon(n)
+          .setType(INodeSection.INode.Type.SYMLINK).setSymlink(b).build();
+      r.writeDelimitedTo(out);
+    }
+
+    private final INodeSection.INode.Builder buildINodeCommon(INode n) {
+      return INodeSection.INode.newBuilder()
+          .setId(n.getId())
+          .setName(ByteString.copyFrom(n.getLocalNameBytes()));
+    }
+  }
+
+  private FSImageFormatPBINode() {
+  }
+}
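
buildPermissionStatus() and loadPermission() squeeze a PermissionStatus into one long: the FsPermission sits in the low 16 bits, the group string-table id starts at bit 16, and the user id at bit 40, each id masked to 24 bits. A minimal worked example of the packing arithmetic, reusing the same masks and offsets as above; the ids and mode are made-up values:

    public class PermissionPackSketch {
      // Same constants as in FSImageFormatPBINode above.
      private static final long USER_GROUP_STRID_MASK = (1 << 24) - 1;
      private static final int USER_STRID_OFFSET = 40;
      private static final int GROUP_STRID_OFFSET = 16;

      static long pack(int userId, int groupId, short perm) {
        // The extra 0xFFFF mask on perm is only defensive; permissions fit in 16 bits.
        return ((userId & USER_GROUP_STRID_MASK) << USER_STRID_OFFSET)
            | ((groupId & USER_GROUP_STRID_MASK) << GROUP_STRID_OFFSET)
            | (perm & 0xFFFF);
      }

      public static void main(String[] args) {
        long id = pack(3, 7, (short) 0755);
        short perm = (short) (id & ((1 << GROUP_STRID_OFFSET) - 1));
        int gsid = (int) ((id >> GROUP_STRID_OFFSET) & USER_GROUP_STRID_MASK);
        int usid = (int) ((id >> USER_STRID_OFFSET) & USER_GROUP_STRID_MASK);
        System.out.println(usid + " " + gsid + " " + Integer.toOctalString(perm));
        // prints: 3 7 755
      }
    }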

+ 583 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatProtobuf.java

@@ -0,0 +1,583 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.security.DigestOutputStream;
+import java.security.MessageDigest;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.LayoutVersion;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveInfoProto;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolInfoProto;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.CacheManagerSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.NameSystemSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.StringTableSection;
+import org.apache.hadoop.hdfs.server.namenode.snapshot.FSImageFormatPBSnapshot;
+import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
+import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
+import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
+import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
+import org.apache.hadoop.hdfs.util.MD5FileUtils;
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressorStream;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.io.LimitInputStream;
+import com.google.protobuf.CodedOutputStream;
+
+/**
+ * Utility class to read / write fsimage in protobuf format.
+ */
+@InterfaceAudience.Private
+public final class FSImageFormatProtobuf {
+  private static final Log LOG = LogFactory.getLog(FSImageFormatProtobuf.class);
+
+  public static final class LoaderContext {
+    private String[] stringTable;
+    private final ArrayList<INodeReference> refList = Lists.newArrayList();
+
+    public String[] getStringTable() {
+      return stringTable;
+    }
+
+    public ArrayList<INodeReference> getRefList() {
+      return refList;
+    }
+  }
+
+  public static final class SaverContext {
+    public static class DeduplicationMap<E> {
+      private final Map<E, Integer> map = Maps.newHashMap();
+      private DeduplicationMap() {}
+
+      static <T> DeduplicationMap<T> newMap() {
+        return new DeduplicationMap<T>();
+      }
+
+      int getId(E value) {
+        if (value == null) {
+          return 0;
+        }
+        Integer v = map.get(value);
+        if (v == null) {
+          int nv = map.size() + 1;
+          map.put(value, nv);
+          return nv;
+        }
+        return v;
+      }
+
+      int size() {
+        return map.size();
+      }
+
+      Set<Entry<E, Integer>> entrySet() {
+        return map.entrySet();
+      }
+    }
+    private final DeduplicationMap<String> stringMap = DeduplicationMap.newMap();
+    private final ArrayList<INodeReference> refList = Lists.newArrayList();
+
+    public DeduplicationMap<String> getStringMap() {
+      return stringMap;
+    }
+
+    public ArrayList<INodeReference> getRefList() {
+      return refList;
+    }
+  }
+
+  public static final class Loader implements FSImageFormat.AbstractLoader {
+    static final int MINIMUM_FILE_LENGTH = 8;
+    private final Configuration conf;
+    private final FSNamesystem fsn;
+    private final LoaderContext ctx;
+    /** The MD5 sum of the loaded file */
+    private MD5Hash imgDigest;
+    /** The transaction ID of the last edit represented by the loaded file */
+    private long imgTxId;
+
+    Loader(Configuration conf, FSNamesystem fsn) {
+      this.conf = conf;
+      this.fsn = fsn;
+      this.ctx = new LoaderContext();
+    }
+
+    @Override
+    public MD5Hash getLoadedImageMd5() {
+      return imgDigest;
+    }
+
+    @Override
+    public long getLoadedImageTxId() {
+      return imgTxId;
+    }
+
+    public LoaderContext getLoaderContext() {
+      return ctx;
+    }
+
+    void load(File file) throws IOException {
+      long start = System.currentTimeMillis();
+      imgDigest = MD5FileUtils.computeMd5ForFile(file);
+      RandomAccessFile raFile = new RandomAccessFile(file, "r");
+      FileInputStream fin = new FileInputStream(file);
+      try {
+        loadInternal(raFile, fin);
+        long end = System.currentTimeMillis();
+        LOG.info("Loaded FSImage in " + (end - start) / 1000 + " seconds.");
+      } finally {
+        fin.close();
+        raFile.close();
+      }
+    }
+
+    private void loadInternal(RandomAccessFile raFile, FileInputStream fin)
+        throws IOException {
+      if (!FSImageUtil.checkFileFormat(raFile)) {
+        throw new IOException("Unrecognized file format");
+      }
+      FileSummary summary = FSImageUtil.loadSummary(raFile);
+
+      FileChannel channel = fin.getChannel();
+
+      FSImageFormatPBINode.Loader inodeLoader = new FSImageFormatPBINode.Loader(
+          fsn, this);
+      FSImageFormatPBSnapshot.Loader snapshotLoader = new FSImageFormatPBSnapshot.Loader(
+          fsn, this);
+
+      ArrayList<FileSummary.Section> sections = Lists.newArrayList(summary
+          .getSectionsList());
+      Collections.sort(sections, new Comparator<FileSummary.Section>() {
+        @Override
+        public int compare(FileSummary.Section s1, FileSummary.Section s2) {
+          SectionName n1 = SectionName.fromString(s1.getName());
+          SectionName n2 = SectionName.fromString(s2.getName());
+          if (n1 == null) {
+            return n2 == null ? 0 : -1;
+          } else if (n2 == null) {
+            return 1;
+          } else {
+            return n1.ordinal() - n2.ordinal();
+          }
+        }
+      });
+
+      StartupProgress prog = NameNode.getStartupProgress();
+      /**
+       * The beginStep() and endStep() calls do not match the section
+       * boundaries, because the current implementation only allows a
+       * particular step to be started once.
+       */
+      Step currentStep = null;
+
+      for (FileSummary.Section s : sections) {
+        channel.position(s.getOffset());
+        InputStream in = new BufferedInputStream(new LimitInputStream(fin,
+            s.getLength()));
+
+        in = FSImageUtil.wrapInputStreamForCompression(conf,
+            summary.getCodec(), in);
+
+        String n = s.getName();
+
+        switch (SectionName.fromString(n)) {
+        case NS_INFO:
+          loadNameSystemSection(in);
+          break;
+        case STRING_TABLE:
+          loadStringTableSection(in);
+          break;
+        case INODE: {
+          currentStep = new Step(StepType.INODES);
+          prog.beginStep(Phase.LOADING_FSIMAGE, currentStep);
+          inodeLoader.loadINodeSection(in);
+        }
+          break;
+        case INODE_REFRENCE:
+          snapshotLoader.loadINodeReferenceSection(in);
+          break;
+        case INODE_DIR:
+          inodeLoader.loadINodeDirectorySection(in);
+          break;
+        case FILES_UNDERCONSTRUCTION:
+          inodeLoader.loadFilesUnderConstructionSection(in);
+          break;
+        case SNAPSHOT:
+          snapshotLoader.loadSnapshotSection(in);
+          break;
+        case SNAPSHOT_DIFF:
+          snapshotLoader.loadSnapshotDiffSection(in);
+          break;
+        case SECRET_MANAGER: {
+          prog.endStep(Phase.LOADING_FSIMAGE, currentStep);
+          Step step = new Step(StepType.DELEGATION_TOKENS);
+          prog.beginStep(Phase.LOADING_FSIMAGE, step);
+          loadSecretManagerSection(in);
+          prog.endStep(Phase.LOADING_FSIMAGE, step);
+        }
+          break;
+        case CACHE_MANAGER: {
+          Step step = new Step(StepType.CACHE_POOLS);
+          prog.beginStep(Phase.LOADING_FSIMAGE, step);
+          loadCacheManagerSection(in);
+          prog.endStep(Phase.LOADING_FSIMAGE, step);
+        }
+          break;
+        default:
+          LOG.warn("Unregconized section " + n);
+          break;
+        }
+      }
+    }
+
+    private void loadNameSystemSection(InputStream in) throws IOException {
+      NameSystemSection s = NameSystemSection.parseDelimitedFrom(in);
+      fsn.setGenerationStampV1(s.getGenstampV1());
+      fsn.setGenerationStampV2(s.getGenstampV2());
+      fsn.setGenerationStampV1Limit(s.getGenstampV1Limit());
+      fsn.setLastAllocatedBlockId(s.getLastAllocatedBlockId());
+      imgTxId = s.getTransactionId();
+    }
+
+    private void loadStringTableSection(InputStream in) throws IOException {
+      StringTableSection s = StringTableSection.parseDelimitedFrom(in);
+      ctx.stringTable = new String[s.getNumEntry() + 1];
+      for (int i = 0; i < s.getNumEntry(); ++i) {
+        StringTableSection.Entry e = StringTableSection.Entry
+            .parseDelimitedFrom(in);
+        ctx.stringTable[e.getId()] = e.getStr();
+      }
+    }
+
+    private void loadSecretManagerSection(InputStream in) throws IOException {
+      SecretManagerSection s = SecretManagerSection.parseDelimitedFrom(in);
+      int numKeys = s.getNumKeys(), numTokens = s.getNumTokens();
+      ArrayList<SecretManagerSection.DelegationKey> keys = Lists
+          .newArrayListWithCapacity(numKeys);
+      ArrayList<SecretManagerSection.PersistToken> tokens = Lists
+          .newArrayListWithCapacity(numTokens);
+
+      for (int i = 0; i < numKeys; ++i)
+        keys.add(SecretManagerSection.DelegationKey.parseDelimitedFrom(in));
+
+      for (int i = 0; i < numTokens; ++i)
+        tokens.add(SecretManagerSection.PersistToken.parseDelimitedFrom(in));
+
+      fsn.loadSecretManagerState(s, keys, tokens);
+    }
+
+    private void loadCacheManagerSection(InputStream in) throws IOException {
+      CacheManagerSection s = CacheManagerSection.parseDelimitedFrom(in);
+      ArrayList<CachePoolInfoProto> pools = Lists.newArrayListWithCapacity(s
+          .getNumPools());
+      ArrayList<CacheDirectiveInfoProto> directives = Lists
+          .newArrayListWithCapacity(s.getNumDirectives());
+      for (int i = 0; i < s.getNumPools(); ++i)
+        pools.add(CachePoolInfoProto.parseDelimitedFrom(in));
+      for (int i = 0; i < s.getNumDirectives(); ++i)
+        directives.add(CacheDirectiveInfoProto.parseDelimitedFrom(in));
+      fsn.getCacheManager().loadState(
+          new CacheManager.PersistState(s, pools, directives));
+    }
+
+  }
+
+  public static final class Saver {
+    public static final int CHECK_CANCEL_INTERVAL = 4096;
+
+    private final SaveNamespaceContext context;
+    private final SaverContext saverContext;
+    private long currentOffset = FSImageUtil.MAGIC_HEADER.length;
+    private MD5Hash savedDigest;
+
+    private FileChannel fileChannel;
+    // OutputStream for the section data
+    private OutputStream sectionOutputStream;
+    private CompressionCodec codec;
+    private OutputStream underlyingOutputStream;
+
+    Saver(SaveNamespaceContext context) {
+      this.context = context;
+      this.saverContext = new SaverContext();
+    }
+
+    public MD5Hash getSavedDigest() {
+      return savedDigest;
+    }
+
+    public SaveNamespaceContext getContext() {
+      return context;
+    }
+
+    public SaverContext getSaverContext() {
+      return saverContext;
+    }
+
+    public void commitSection(FileSummary.Builder summary, SectionName name)
+        throws IOException {
+      long oldOffset = currentOffset;
+      flushSectionOutputStream();
+
+      if (codec != null) {
+        sectionOutputStream = codec.createOutputStream(underlyingOutputStream);
+      } else {
+        sectionOutputStream = underlyingOutputStream;
+      }
+      long length = fileChannel.position() - oldOffset;
+      summary.addSections(FileSummary.Section.newBuilder().setName(name.name)
+          .setLength(length).setOffset(currentOffset));
+      currentOffset += length;
+    }
+
+    private void flushSectionOutputStream() throws IOException {
+      if (codec != null) {
+        ((CompressorStream) sectionOutputStream).finish();
+      }
+      sectionOutputStream.flush();
+    }
+
+    void save(File file, FSImageCompression compression) throws IOException {
+      FileOutputStream fout = new FileOutputStream(file);
+      fileChannel = fout.getChannel();
+      try {
+        saveInternal(fout, compression, file.getAbsolutePath().toString());
+      } finally {
+        fout.close();
+      }
+    }
+
+    private static void saveFileSummary(OutputStream out, FileSummary summary)
+        throws IOException {
+      summary.writeDelimitedTo(out);
+      int length = getOndiskTrunkSize(summary);
+      byte[] lengthBytes = new byte[4];
+      ByteBuffer.wrap(lengthBytes).asIntBuffer().put(length);
+      out.write(lengthBytes);
+    }
+
+    private void saveInodes(FileSummary.Builder summary) throws IOException {
+      FSImageFormatPBINode.Saver saver = new FSImageFormatPBINode.Saver(this,
+          summary);
+
+      saver.serializeINodeSection(sectionOutputStream);
+      saver.serializeINodeDirectorySection(sectionOutputStream);
+      saver.serializeFilesUCSection(sectionOutputStream);
+    }
+
+    private void saveSnapshots(FileSummary.Builder summary) throws IOException {
+      FSImageFormatPBSnapshot.Saver snapshotSaver = new FSImageFormatPBSnapshot.Saver(
+          this, summary, context, context.getSourceNamesystem());
+
+      snapshotSaver.serializeSnapshotSection(sectionOutputStream);
+      snapshotSaver.serializeSnapshotDiffSection(sectionOutputStream);
+      snapshotSaver.serializeINodeReferenceSection(sectionOutputStream);
+    }
+
+    private void saveInternal(FileOutputStream fout,
+        FSImageCompression compression, String filePath) throws IOException {
+      StartupProgress prog = NameNode.getStartupProgress();
+      MessageDigest digester = MD5Hash.getDigester();
+
+      underlyingOutputStream = new DigestOutputStream(new BufferedOutputStream(
+          fout), digester);
+      underlyingOutputStream.write(FSImageUtil.MAGIC_HEADER);
+
+      fileChannel = fout.getChannel();
+
+      FileSummary.Builder b = FileSummary.newBuilder()
+          .setOndiskVersion(FSImageUtil.FILE_VERSION)
+          .setLayoutVersion(LayoutVersion.getCurrentLayoutVersion());
+
+      codec = compression.getImageCodec();
+      if (codec != null) {
+        b.setCodec(codec.getClass().getCanonicalName());
+        sectionOutputStream = codec.createOutputStream(underlyingOutputStream);
+      } else {
+        sectionOutputStream = underlyingOutputStream;
+      }
+
+      saveNameSystemSection(b);
+      // Check for cancellation right after serializing the name system section.
+      // Some unit tests, such as TestSaveNamespace#testCancelSaveNameSpace,
+      // depend on this behavior.
+      context.checkCancelled();
+
+      Step step = new Step(StepType.INODES, filePath);
+      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
+      saveInodes(b);
+      saveSnapshots(b);
+      prog.endStep(Phase.SAVING_CHECKPOINT, step);
+
+      step = new Step(StepType.DELEGATION_TOKENS, filePath);
+      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
+      saveSecretManagerSection(b);
+      prog.endStep(Phase.SAVING_CHECKPOINT, step);
+
+      step = new Step(StepType.CACHE_POOLS, filePath);
+      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
+      saveCacheManagerSection(b);
+      prog.endStep(Phase.SAVING_CHECKPOINT, step);
+
+      saveStringTableSection(b);
+
+      // We use the underlyingOutputStream to write the FileSummary. Therefore
+      // flush the buffered section stream (which is potentially compressed) first.
+      flushSectionOutputStream();
+
+      FileSummary summary = b.build();
+      saveFileSummary(underlyingOutputStream, summary);
+      underlyingOutputStream.close();
+      savedDigest = new MD5Hash(digester.digest());
+    }
+
+    private void saveSecretManagerSection(FileSummary.Builder summary)
+        throws IOException {
+      final FSNamesystem fsn = context.getSourceNamesystem();
+      DelegationTokenSecretManager.SecretManagerState state = fsn
+          .saveSecretManagerState();
+      state.section.writeDelimitedTo(sectionOutputStream);
+      for (SecretManagerSection.DelegationKey k : state.keys)
+        k.writeDelimitedTo(sectionOutputStream);
+
+      for (SecretManagerSection.PersistToken t : state.tokens)
+        t.writeDelimitedTo(sectionOutputStream);
+
+      commitSection(summary, SectionName.SECRET_MANAGER);
+    }
+
+    private void saveCacheManagerSection(FileSummary.Builder summary)
+        throws IOException {
+      final FSNamesystem fsn = context.getSourceNamesystem();
+      CacheManager.PersistState state = fsn.getCacheManager().saveState();
+      state.section.writeDelimitedTo(sectionOutputStream);
+
+      for (CachePoolInfoProto p : state.pools)
+        p.writeDelimitedTo(sectionOutputStream);
+
+      for (CacheDirectiveInfoProto p : state.directives)
+        p.writeDelimitedTo(sectionOutputStream);
+
+      commitSection(summary, SectionName.CACHE_MANAGER);
+    }
+
+    private void saveNameSystemSection(FileSummary.Builder summary)
+        throws IOException {
+      final FSNamesystem fsn = context.getSourceNamesystem();
+      OutputStream out = sectionOutputStream;
+      NameSystemSection.Builder b = NameSystemSection.newBuilder()
+          .setGenstampV1(fsn.getGenerationStampV1())
+          .setGenstampV1Limit(fsn.getGenerationStampV1Limit())
+          .setGenstampV2(fsn.getGenerationStampV2())
+          .setLastAllocatedBlockId(fsn.getLastAllocatedBlockId())
+          .setTransactionId(context.getTxId());
+
+      // We use the non-locked version of getNamespaceInfo here since
+      // the coordinating thread of saveNamespace already has read-locked
+      // the namespace for us. If we attempt to take another readlock
+      // from the actual saver thread, there is a potential for a
+      // fairness-related deadlock. See the comments on HDFS-2223.
+      b.setNamespaceId(fsn.unprotectedGetNamespaceInfo().getNamespaceID());
+      NameSystemSection s = b.build();
+      s.writeDelimitedTo(out);
+
+      commitSection(summary, SectionName.NS_INFO);
+    }
+
+    private void saveStringTableSection(FileSummary.Builder summary)
+        throws IOException {
+      OutputStream out = sectionOutputStream;
+      StringTableSection.Builder b = StringTableSection.newBuilder()
+          .setNumEntry(saverContext.stringMap.size());
+      b.build().writeDelimitedTo(out);
+      for (Entry<String, Integer> e : saverContext.stringMap.entrySet()) {
+        StringTableSection.Entry.Builder eb = StringTableSection.Entry
+            .newBuilder().setId(e.getValue()).setStr(e.getKey());
+        eb.build().writeDelimitedTo(out);
+      }
+      commitSection(summary, SectionName.STRING_TABLE);
+    }
+  }
+
+  /**
+   * Supported section names. The order of the enum constants determines the order of
+   * loading.
+   */
+  public enum SectionName {
+    NS_INFO("NS_INFO"),
+    STRING_TABLE("STRING_TABLE"),
+    INODE("INODE"),
+    INODE_REFRENCE("INODE_REFRENCE"),
+    SNAPSHOT("SNAPSHOT"),
+    INODE_DIR("INODE_DIR"),
+    FILES_UNDERCONSTRUCTION("FILES_UNDERCONSTRUCTION"),
+    SNAPSHOT_DIFF("SNAPSHOT_DIFF"),
+    SECRET_MANAGER("SECRET_MANAGER"),
+    CACHE_MANAGER("CACHE_MANAGER");
+
+    private static final SectionName[] values = SectionName.values();
+
+    public static SectionName fromString(String name) {
+      for (SectionName n : values) {
+        if (n.name.equals(name))
+          return n;
+      }
+      return null;
+    }
+
+    private final String name;
+
+    private SectionName(String name) {
+      this.name = name;
+    }
+  }
+
+  private static int getOndiskTrunkSize(com.google.protobuf.GeneratedMessage s) {
+    return CodedOutputStream.computeRawVarint32Size(s.getSerializedSize())
+        + s.getSerializedSize();
+  }
+
+  private FSImageFormatProtobuf() {
+  }
+}
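
Saver.saveInternal above streams everything through a DigestOutputStream, so the MD5 reported as savedDigest covers every byte written, and saveFileSummary appends the delimited FileSummary followed by its length in a 4-byte big-endian field, making the summary a trailer that can be located from the end of the file. The following is a minimal, self-contained sketch of that write path; it uses plain byte arrays instead of protobuf messages and compression codecs, and the output file name is hypothetical.

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.security.DigestOutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class TrailerWriteSketch {
  public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
    // Stand-ins for MAGIC_HEADER, the (possibly compressed) section bytes, and the
    // delimited FileSummary; the real Saver produces these with protobuf and codecs.
    byte[] magic = "SKETCH01".getBytes(StandardCharsets.UTF_8);
    byte[] sections = "section-bytes".getBytes(StandardCharsets.UTF_8);
    byte[] summary = "summary-bytes".getBytes(StandardCharsets.UTF_8);

    MessageDigest digester = MessageDigest.getInstance("MD5");
    OutputStream out = new DigestOutputStream(
        new BufferedOutputStream(new FileOutputStream("image.sketch")), digester);
    try {
      out.write(magic);     // fixed header at offset 0
      out.write(sections);  // section bodies, tracked by commitSection in the real Saver
      out.write(summary);   // the summary goes last ...
      byte[] len = new byte[4];
      ByteBuffer.wrap(len).asIntBuffer().put(summary.length); // big-endian by default
      out.write(len);       // ... followed by its 4-byte length, forming the trailer
    } finally {
      out.close();
    }

    // Digest of everything written, analogous to savedDigest in the Saver above.
    StringBuilder hex = new StringBuilder();
    for (byte b : digester.digest()) {
      hex.append(String.format("%02x", b & 0xff));
    }
    System.out.println(hex);
  }
}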

+ 93 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageUtil.java

@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.util.Arrays;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.LayoutVersion;
+import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.Loader;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.io.compress.CompressionCodec;
+
+@InterfaceAudience.Private
+public final class FSImageUtil {
+  public static final byte[] MAGIC_HEADER = "HDFSIMG1".getBytes();
+  public static final int FILE_VERSION = 1;
+
+  public static boolean checkFileFormat(RandomAccessFile file)
+      throws IOException {
+    if (file.length() < Loader.MINIMUM_FILE_LENGTH)
+      return false;
+
+    byte[] magic = new byte[MAGIC_HEADER.length];
+    file.readFully(magic);
+    if (!Arrays.equals(MAGIC_HEADER, magic))
+      return false;
+
+    return true;
+  }
+
+  public static FileSummary loadSummary(RandomAccessFile file)
+      throws IOException {
+    final int FILE_LENGTH_FIELD_SIZE = 4;
+    long fileLength = file.length();
+    file.seek(fileLength - FILE_LENGTH_FIELD_SIZE);
+    int summaryLength = file.readInt();
+
+    if (summaryLength <= 0) {
+      throw new IOException("Negative or zero length of the image file summary");
+    }
+    file.seek(fileLength - FILE_LENGTH_FIELD_SIZE - summaryLength);
+
+    byte[] summaryBytes = new byte[summaryLength];
+    file.readFully(summaryBytes);
+
+    FileSummary summary = FileSummary
+        .parseDelimitedFrom(new ByteArrayInputStream(summaryBytes));
+    if (summary.getOndiskVersion() != FILE_VERSION) {
+      throw new IOException("Unsupported file version "
+          + summary.getOndiskVersion());
+    }
+
+    if (!LayoutVersion.supports(Feature.PROTOBUF_FORMAT,
+        summary.getLayoutVersion())) {
+      throw new IOException("Unsupported layout version "
+          + summary.getLayoutVersion());
+    }
+    return summary;
+  }
+
+  public static InputStream wrapInputStreamForCompression(
+      Configuration conf, String codec, InputStream in) throws IOException {
+    if (codec.isEmpty())
+      return in;
+
+    FSImageCompression compression = FSImageCompression.createCompression(
+        conf, codec);
+    CompressionCodec imageCodec = compression.getImageCodec();
+    return imageCodec.createInputStream(in);
+  }
+
+}
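
loadSummary above recovers that trailer: it seeks to fileLength - 4, reads the summary length as a big-endian int, seeks back by the length plus the 4-byte field, and reads the summary bytes before parsing and validating them. A minimal sketch of the same seek arithmetic, reading the toy file written by the sketch above rather than a real FSImage:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

public class TrailerReadSketch {
  public static void main(String[] args) throws IOException {
    RandomAccessFile file = new RandomAccessFile("image.sketch", "r");
    try {
      final int LENGTH_FIELD_SIZE = 4;            // same width as loadSummary's length field
      long fileLength = file.length();
      file.seek(fileLength - LENGTH_FIELD_SIZE);  // jump to the trailing length field
      int summaryLength = file.readInt();         // big-endian, matching the writer's ByteBuffer
      if (summaryLength <= 0) {
        throw new IOException("Corrupt trailer: non-positive summary length");
      }
      file.seek(fileLength - LENGTH_FIELD_SIZE - summaryLength);
      byte[] summaryBytes = new byte[summaryLength];
      file.readFully(summaryBytes);               // the real code then parses these bytes
      System.out.println(new String(summaryBytes, StandardCharsets.UTF_8));
    } finally {
      file.close();
    }
  }
}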

+ 28 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -181,6 +181,7 @@ import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
@@ -198,6 +199,8 @@ import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.server.common.Util;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection.PersistToken;
 import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
 import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
 import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
@@ -6006,6 +6009,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     }
   }
 
+  /**
+   * @return all the under-construction files in the lease map
+   */
+  Map<String, INodeFile> getFilesUnderConstruction() {
+    synchronized (leaseManager) {
+      return leaseManager.getINodesUnderConstruction();
+    }
+  }
+
   /**
    * Register a Backup name-node, verifying that it belongs
    * to the correct namespace, and adding it to the set of
@@ -6282,6 +6294,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
   }
 
+  SecretManagerState saveSecretManagerState() {
+    return dtSecretManager.saveSecretManagerState();
+  }
+
   /**
    * @param in load the state of secret manager from input stream
    */
@@ -6289,6 +6305,12 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     dtSecretManager.loadSecretManagerStateCompat(in);
   }
 
+  void loadSecretManagerState(SecretManagerSection s,
+      List<SecretManagerSection.DelegationKey> keys,
+      List<SecretManagerSection.PersistToken> tokens) throws IOException {
+    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
+  }
+
   /**
    * Log the updateMasterKey operation to edit logs
    * 
@@ -6814,7 +6836,12 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     return this.blockManager.getDatanodeManager()
         .shouldAvoidStaleDataNodesForWrite();
   }
-  
+
+  @Override // FSClusterStats
+  public int getNumDatanodesInService() {
+    return getNumLiveDataNodes() - getNumDecomLiveDataNodes();
+  }
+
   public SnapshotManager getSnapshotManager() {
     return snapshotManager;
   }
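
The getFilesUnderConstruction accessor added above takes the leaseManager's monitor before reading the under-construction map, so the read cannot interleave with the lease manager's own synchronized mutations. A minimal sketch of that convention with a hypothetical, self-contained tracker class (the copy-on-read is a simplification, not the LeaseManager API):

import java.util.HashMap;
import java.util.Map;

// Hypothetical tracker illustrating the locking convention: every access to the
// internal map, including the read-side snapshot, happens under the same monitor.
class LeaseTrackerSketch {
  private final Map<String, String> underConstruction = new HashMap<String, String>();

  synchronized void add(String path, String clientName) {
    underConstruction.put(path, clientName);
  }

  synchronized void remove(String path) {
    underConstruction.remove(path);
  }

  // Analogous to wrapping the call in synchronized (leaseManager) above: the caller
  // gets a consistent view without racing against concurrent add/remove calls.
  synchronized Map<String, String> getUnderConstruction() {
    return new HashMap<String, String>(underConstruction);
  }
}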

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeDirectory.java

@@ -171,7 +171,7 @@ public class INodeDirectory extends INodeWithAdditionalFields
     return children == null? -1: Collections.binarySearch(children, name);
   }
   
-  protected DirectoryWithSnapshotFeature addSnapshotFeature(
+  public DirectoryWithSnapshotFeature addSnapshotFeature(
       DirectoryDiffList diffs) {
     Preconditions.checkState(!isWithSnapshot(), 
         "Directory is already with snapshot");

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java

@@ -252,7 +252,7 @@ public class INodeFile extends INodeWithAdditionalFields
   
   /* Start of Snapshot Feature */
 
-  private FileWithSnapshotFeature addSnapshotFeature(FileDiffList diffs) {
+  public FileWithSnapshotFeature addSnapshotFeature(FileDiffList diffs) {
     Preconditions.checkState(!isWithSnapshot(), 
         "File is already with snapshot");
     FileWithSnapshotFeature sf = new FileWithSnapshotFeature(diffs);

+ 5 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java

@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs.server.namenode;
 
+import java.util.Iterator;
 import java.util.List;
 
 import org.apache.hadoop.fs.permission.FsPermission;
@@ -46,6 +47,10 @@ public class INodeMap {
   /** Synchronized by external lock. */
   private final GSet<INode, INodeWithAdditionalFields> map;
   
+  public Iterator<INodeWithAdditionalFields> getMapIterator() {
+    return map.iterator();
+  }
+
   private INodeMap(GSet<INode, INodeWithAdditionalFields> map) {
     Preconditions.checkArgument(map != null);
     this.map = map;

+ 27 - 38
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -17,39 +17,22 @@
  */
 package org.apache.hadoop.hdfs.server.namenode;
 
-import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
-import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
-import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
-
-import java.io.IOException;
-import java.io.PrintStream;
-import java.net.InetSocketAddress;
-import java.net.URI;
-import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-import javax.management.ObjectName;
-
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.HadoopIllegalArgumentException;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Trash;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
 import org.apache.hadoop.ha.HAServiceStatus;
 import org.apache.hadoop.ha.HealthCheckFailedException;
 import org.apache.hadoop.ha.ServiceFailedException;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Trash;
-
-import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
-import static org.apache.hadoop.util.ExitUtil.terminate;
-import static org.apache.hadoop.util.ToolRunner.confirmPrompt;
-
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.HAUtil;
@@ -58,20 +41,11 @@ import org.apache.hadoop.hdfs.protocol.ClientProtocol;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
-import org.apache.hadoop.hdfs.server.namenode.ha.ActiveState;
-import org.apache.hadoop.hdfs.server.namenode.ha.BootstrapStandby;
-import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
-import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
-import org.apache.hadoop.hdfs.server.namenode.ha.StandbyState;
+import org.apache.hadoop.hdfs.server.namenode.ha.*;
 import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgressMetrics;
-import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
-import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
-import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.*;
 import org.apache.hadoop.ipc.Server;
 import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -89,10 +63,23 @@ import org.apache.hadoop.util.JvmPauseMonitor;
 import org.apache.hadoop.util.ServicePlugin;
 import org.apache.hadoop.util.StringUtils;
 
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
+import javax.management.ObjectName;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
+import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
+import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+import static org.apache.hadoop.util.ExitUtil.terminate;
+import static org.apache.hadoop.util.ToolRunner.confirmPrompt;
 
 /**********************************************************
  * NameNode serves as both directory namespace manager and
@@ -183,8 +170,10 @@ public class NameNode implements NameNodeStatusMXBean {
     DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
     DFS_NAMENODE_SERVICE_RPC_BIND_HOST_KEY,
     DFS_NAMENODE_HTTP_ADDRESS_KEY,
+    DFS_NAMENODE_HTTPS_ADDRESS_KEY,
     DFS_NAMENODE_KEYTAB_FILE_KEY,
     DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY,
+    DFS_NAMENODE_SECONDARY_HTTPS_ADDRESS_KEY,
     DFS_SECONDARY_NAMENODE_KEYTAB_FILE_KEY,
     DFS_NAMENODE_BACKUP_ADDRESS_KEY,
     DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY,
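
The HTTPS address keys are added to the list of NameNode-specific configuration keys shown above; in federation and HA deployments such keys are typically resolved by suffixing the nameservice and namenode ids onto the base key. A minimal sketch of that suffixing convention; the ids below are made up and the helper is not a Hadoop API:

// Minimal sketch of the per-NameNode key suffixing convention that the keys in the
// list above participate in; the nameservice and namenode ids are hypothetical.
public class NamenodeKeySuffixSketch {
  static String perNamenodeKey(String baseKey, String nameserviceId, String namenodeId) {
    return baseKey + "." + nameserviceId + "." + namenodeId;
  }

  public static void main(String[] args) {
    // e.g. dfs.namenode.https-address.mycluster.nn1
    System.out.println(perNamenodeKey("dfs.namenode.https-address", "mycluster", "nn1"));
  }
}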

+ 43 - 26
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java

@@ -32,6 +32,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -40,9 +41,12 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.hdfs.BlockReader;
 import org.apache.hadoop.hdfs.BlockReaderFactory;
+import org.apache.hadoop.hdfs.ClientContext;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.RemotePeerFactory;
+import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.net.TcpPeerServer;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.DirectoryListing;
@@ -569,11 +573,10 @@ public class NamenodeFsck {
     int failures = 0;
     InetSocketAddress targetAddr = null;
     TreeSet<DatanodeInfo> deadNodes = new TreeSet<DatanodeInfo>();
-    Socket s = null;
     BlockReader blockReader = null; 
     ExtendedBlock block = lblock.getBlock(); 
 
-    while (s == null) {
+    while (blockReader == null) {
       DatanodeInfo chosenNode;
       
       try {
@@ -593,34 +596,47 @@ public class NamenodeFsck {
         continue;
       }
       try {
-        s = NetUtils.getDefaultSocketFactory(conf).createSocket();
-        s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
-        s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
-        
-        String file = BlockReaderFactory.getFileName(targetAddr, block.getBlockPoolId(),
-            block.getBlockId());
-        blockReader = BlockReaderFactory.newBlockReader(dfs.getConf(),
-            file, block, lblock.getBlockToken(), 0, -1, true, "fsck",
-            TcpPeerServer.peerFromSocketAndKey(s, namenode.getRpcServer().
-                getDataEncryptionKey()), chosenNode, null, null, null, 
-                false, CachingStrategy.newDropBehind());
-        
+        String file = BlockReaderFactory.getFileName(targetAddr,
+            block.getBlockPoolId(), block.getBlockId());
+        blockReader = new BlockReaderFactory(dfs.getConf()).
+            setFileName(file).
+            setBlock(block).
+            setBlockToken(lblock.getBlockToken()).
+            setStartOffset(0).
+            setLength(-1).
+            setVerifyChecksum(true).
+            setClientName("fsck").
+            setDatanodeInfo(chosenNode).
+            setInetSocketAddress(targetAddr).
+            setCachingStrategy(CachingStrategy.newDropBehind()).
+            setClientCacheContext(dfs.getClientContext()).
+            setConfiguration(namenode.conf).
+            setRemotePeerFactory(new RemotePeerFactory() {
+              @Override
+              public Peer newConnectedPeer(InetSocketAddress addr)
+                  throws IOException {
+                Peer peer = null;
+                Socket s = NetUtils.getDefaultSocketFactory(conf).createSocket();
+                try {
+                  s.connect(addr, HdfsServerConstants.READ_TIMEOUT);
+                  s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
+                  peer = TcpPeerServer.peerFromSocketAndKey(s, namenode.getRpcServer().
+                        getDataEncryptionKey());
+                } finally {
+                  if (peer == null) {
+                    IOUtils.closeQuietly(s);
+                  }
+                }
+                return peer;
+              }
+            }).
+            build();
       }  catch (IOException ex) {
         // Put chosen node into dead list, continue
         LOG.info("Failed to connect to " + targetAddr + ":" + ex);
         deadNodes.add(chosenNode);
-        if (s != null) {
-          try {
-            s.close();
-          } catch (IOException iex) {
-          }
-        }
-        s = null;
       }
     }
-    if (blockReader == null) {
-      throw new Exception("Could not open data stream for " + lblock.getBlock());
-    }
     byte[] buf = new byte[1024];
     int cnt = 0;
     boolean success = true;
@@ -638,10 +654,11 @@ public class NamenodeFsck {
       LOG.error("Error reading block", e);
       success = false;
     } finally {
-      try {s.close(); } catch (Exception e1) {}
+      blockReader.close();
     }
-    if (!success)
+    if (!success) {
       throw new Exception("Could not copy block data for " + lblock.getBlock());
+    }
   }
       
   /*
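
The rewritten fsck reader above pushes connection handling into a RemotePeerFactory callback whose finally block closes the raw socket only when wrapping it into a Peer failed; once the wrapper exists, it owns the socket and the caller closes the wrapper instead. A minimal sketch of that cleanup idiom using only JDK types (the class name, address, and timeout are placeholders):

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.charset.StandardCharsets;

public class WrapOrCloseSketch {
  // Mirrors newConnectedPeer above: close the raw socket only if no wrapper took
  // ownership of it; otherwise closing the wrapper later also closes the socket.
  static Reader newConnectedReader(InetSocketAddress addr, int timeoutMillis)
      throws IOException {
    Reader reader = null;
    Socket s = new Socket();
    try {
      s.connect(addr, timeoutMillis);
      s.setSoTimeout(timeoutMillis);
      reader = new InputStreamReader(s.getInputStream(), StandardCharsets.UTF_8);
    } finally {
      if (reader == null) {
        try {
          s.close();                // connect or wrapping failed; nothing else will clean up
        } catch (IOException ignored) {
          // best-effort cleanup, like IOUtils.closeQuietly in the diff above
        }
      }
    }
    return reader;                  // caller closes the reader, which closes the socket with it
  }
}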

+ 4 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceContext.java

@@ -22,6 +22,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.CountDownLatch;
 
+import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.util.Canceler;
 
@@ -32,7 +33,8 @@ import com.google.common.base.Preconditions;
  * allows cancellation, and also is responsible for accumulating
  * failed storage directories.
  */
-class SaveNamespaceContext {
+@InterfaceAudience.Private
+public class SaveNamespaceContext {
   private final FSNamesystem sourceNamesystem;
   private final long txid;
   private final List<StorageDirectory> errorSDs =
@@ -72,7 +74,7 @@ class SaveNamespaceContext {
     completionLatch.countDown();
   }
 
-  void checkCancelled() throws SaveNamespaceCancelledException {
+  public void checkCancelled() throws SaveNamespaceCancelledException {
     if (canceller.isCancelled()) {
       throw new SaveNamespaceCancelledException(
           canceller.getCancellationReason());

+ 5 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryWithSnapshotFeature.java

@@ -244,7 +244,7 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
       this.isSnapshotRoot = isSnapshotRoot;
     }
 
-    ChildrenDiff getChildrenDiff() {
+    public ChildrenDiff getChildrenDiff() {
       return diff;
     }
     
@@ -343,6 +343,10 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
       return super.toString() + " childrenSize=" + childrenSize + ", " + diff;
     }
 
+    int getChildrenSize() {
+      return childrenSize;
+    }
+
     @Override
     void write(DataOutput out, ReferenceMap referenceMap) throws IOException {
       writeSnapshot(out);

+ 506 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java

@@ -0,0 +1,506 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.snapshot;
+
+import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.loadINodeDirectory;
+import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.loadPermission;
+import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.updateBlocksMap;
+import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Saver.buildINodeDirectory;
+import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Saver.buildINodeFile;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeReferenceSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection.CreatedListEntry;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection.DiffEntry.Type;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotSection;
+import org.apache.hadoop.hdfs.server.namenode.INode;
+import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
+import org.apache.hadoop.hdfs.server.namenode.INodeDirectoryAttributes;
+import org.apache.hadoop.hdfs.server.namenode.INodeFile;
+import org.apache.hadoop.hdfs.server.namenode.INodeFileAttributes;
+import org.apache.hadoop.hdfs.server.namenode.INodeMap;
+import org.apache.hadoop.hdfs.server.namenode.INodeReference;
+import org.apache.hadoop.hdfs.server.namenode.INodeReference.DstReference;
+import org.apache.hadoop.hdfs.server.namenode.INodeReference.WithCount;
+import org.apache.hadoop.hdfs.server.namenode.INodeReference.WithName;
+import org.apache.hadoop.hdfs.server.namenode.INodeWithAdditionalFields;
+import org.apache.hadoop.hdfs.server.namenode.SaveNamespaceContext;
+import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature.DirectoryDiff;
+import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature.DirectoryDiffList;
+import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot.Root;
+import org.apache.hadoop.hdfs.util.Diff.ListType;
+
+import com.google.common.base.Preconditions;
+import com.google.protobuf.ByteString;
+
+@InterfaceAudience.Private
+public class FSImageFormatPBSnapshot {
+  /**
+   * Loading snapshot related information from protobuf based FSImage
+   */
+  public final static class Loader {
+    private final FSNamesystem fsn;
+    private final FSDirectory fsDir;
+    private final FSImageFormatProtobuf.Loader parent;
+    private final Map<Integer, Snapshot> snapshotMap;
+
+    public Loader(FSNamesystem fsn, FSImageFormatProtobuf.Loader parent) {
+      this.fsn = fsn;
+      this.fsDir = fsn.getFSDirectory();
+      this.snapshotMap = new HashMap<Integer, Snapshot>();
+      this.parent = parent;
+    }
+
+    /**
+     * The sequence of the reference nodes in refList must be strictly the
+     * same as their sequence in the fsimage.
+     */
+    public void loadINodeReferenceSection(InputStream in) throws IOException {
+      final List<INodeReference> refList = parent.getLoaderContext()
+          .getRefList();
+      while (true) {
+        INodeReferenceSection.INodeReference e = INodeReferenceSection
+            .INodeReference.parseDelimitedFrom(in);
+        if (e == null) {
+          break;
+        }
+        INodeReference ref = loadINodeReference(e);
+        refList.add(ref);
+      }
+    }
+
+    private INodeReference loadINodeReference(
+        INodeReferenceSection.INodeReference r) throws IOException {
+      long referredId = r.getReferredId();
+      INode referred = fsDir.getInode(referredId);
+      WithCount withCount = (WithCount) referred.getParentReference();
+      if (withCount == null) {
+        withCount = new INodeReference.WithCount(null, referred);
+      }
+      final INodeReference ref;
+      if (r.hasDstSnapshotId()) { // DstReference
+        ref = new INodeReference.DstReference(null, withCount,
+            r.getDstSnapshotId());
+      } else {
+        ref = new INodeReference.WithName(null, withCount, r.getName()
+            .toByteArray(), r.getLastSnapshotId());
+      }
+      return ref;
+    }
+
+    /**
+     * Load the snapshots section from fsimage. Also convert snapshottable
+     * directories into {@link INodeDirectorySnapshottable}.
+     *
+     */
+    public void loadSnapshotSection(InputStream in) throws IOException {
+      SnapshotManager sm = fsn.getSnapshotManager();
+      SnapshotSection section = SnapshotSection.parseDelimitedFrom(in);
+      int snum = section.getNumSnapshots();
+      sm.setNumSnapshots(snum);
+      sm.setSnapshotCounter(section.getSnapshotCounter());
+      for (long sdirId : section.getSnapshottableDirList()) {
+        INodeDirectory dir = fsDir.getInode(sdirId).asDirectory();
+        final INodeDirectorySnapshottable sdir;
+        if (!dir.isSnapshottable()) {
+          sdir = new INodeDirectorySnapshottable(dir);
+          fsDir.addToInodeMap(sdir);
+        } else {
+          // dir is the root directory, and the admin made it snapshottable earlier
+          sdir = (INodeDirectorySnapshottable) dir;
+          sdir.setSnapshotQuota(INodeDirectorySnapshottable.SNAPSHOT_LIMIT);
+        }
+        sm.addSnapshottable(sdir);
+      }
+      loadSnapshots(in, snum);
+    }
+
+    private void loadSnapshots(InputStream in, int size) throws IOException {
+      for (int i = 0; i < size; i++) {
+        SnapshotSection.Snapshot pbs = SnapshotSection.Snapshot
+            .parseDelimitedFrom(in);
+        INodeDirectory root = loadINodeDirectory(pbs.getRoot(),
+            parent.getLoaderContext().getStringTable());
+        int sid = pbs.getSnapshotId();
+        INodeDirectorySnapshottable parent = (INodeDirectorySnapshottable) fsDir
+            .getInode(root.getId()).asDirectory();
+        Snapshot snapshot = new Snapshot(sid, root, parent);
+        // add the snapshot to its parent; since we follow the order of
+        // snapshotsByNames when saving, we do not need to sort when loading
+        parent.addSnapshot(snapshot);
+        snapshotMap.put(sid, snapshot);
+      }
+    }
+
+    /**
+     * Load the snapshot diff section from fsimage.
+     */
+    public void loadSnapshotDiffSection(InputStream in) throws IOException {
+      final List<INodeReference> refList = parent.getLoaderContext()
+          .getRefList();
+      while (true) {
+        SnapshotDiffSection.DiffEntry entry = SnapshotDiffSection.DiffEntry
+            .parseDelimitedFrom(in);
+        if (entry == null) {
+          break;
+        }
+        long inodeId = entry.getInodeId();
+        INode inode = fsDir.getInode(inodeId);
+        SnapshotDiffSection.DiffEntry.Type type = entry.getType();
+        switch (type) {
+        case FILEDIFF:
+          loadFileDiffList(in, inode.asFile(), entry.getNumOfDiff());
+          break;
+        case DIRECTORYDIFF:
+          loadDirectoryDiffList(in, inode.asDirectory(), entry.getNumOfDiff(),
+              refList);
+          break;
+        }
+      }
+    }
+
+    /** Load FileDiff list for a file with snapshot feature */
+    private void loadFileDiffList(InputStream in, INodeFile file, int size)
+        throws IOException {
+      final FileDiffList diffs = new FileDiffList();
+      for (int i = 0; i < size; i++) {
+        SnapshotDiffSection.FileDiff pbf = SnapshotDiffSection.FileDiff
+            .parseDelimitedFrom(in);
+        INodeFileAttributes copy = null;
+        if (pbf.hasSnapshotCopy()) {
+          INodeSection.INodeFile fileInPb = pbf.getSnapshotCopy();
+          PermissionStatus permission = loadPermission(
+              fileInPb.getPermission(), parent.getLoaderContext()
+                  .getStringTable());
+          copy = new INodeFileAttributes.SnapshotCopy(pbf.getName()
+              .toByteArray(), permission, fileInPb.getModificationTime(),
+              fileInPb.getAccessTime(), (short) fileInPb.getReplication(),
+              fileInPb.getPreferredBlockSize());
+        }
+
+        FileDiff diff = new FileDiff(pbf.getSnapshotId(), copy, null,
+            pbf.getFileSize());
+        diffs.addFirst(diff);
+      }
+      file.addSnapshotFeature(diffs);
+    }
+
+    /** Load the created list in a DirectoryDiff */
+    private List<INode> loadCreatedList(InputStream in, INodeDirectory dir,
+        int size) throws IOException {
+      List<INode> clist = new ArrayList<INode>(size);
+      for (long c = 0; c < size; c++) {
+        CreatedListEntry entry = CreatedListEntry.parseDelimitedFrom(in);
+        INode created = SnapshotFSImageFormat.loadCreated(entry.getName()
+            .toByteArray(), dir);
+        clist.add(created);
+      }
+      return clist;
+    }
+
+    private void addToDeletedList(INode dnode, INodeDirectory parent) {
+      dnode.setParent(parent);
+      if (dnode.isFile()) {
+        updateBlocksMap(dnode.asFile(), fsn.getBlockManager());
+      }
+    }
+
+    /**
+     * Load the deleted list in a DirectoryDiff
+     */
+    private List<INode> loadDeletedList(final List<INodeReference> refList,
+        InputStream in, INodeDirectory dir, List<Long> deletedNodes,
+        List<Integer> deletedRefNodes)
+        throws IOException {
+      List<INode> dlist = new ArrayList<INode>(deletedRefNodes.size()
+          + deletedNodes.size());
+      // load non-reference inodes
+      for (long deletedId : deletedNodes) {
+        INode deleted = fsDir.getInode(deletedId);
+        dlist.add(deleted);
+        addToDeletedList(deleted, dir);
+      }
+      // load reference nodes in the deleted list
+      for (int refId : deletedRefNodes) {
+        INodeReference deletedRef = refList.get(refId);
+        dlist.add(deletedRef);
+        addToDeletedList(deletedRef, dir);
+      }
+
+      Collections.sort(dlist, new Comparator<INode>() {
+        @Override
+        public int compare(INode n1, INode n2) {
+          return n1.compareTo(n2.getLocalNameBytes());
+        }
+      });
+      return dlist;
+    }
+
+    /** Load DirectoryDiff list for a directory with snapshot feature */
+    private void loadDirectoryDiffList(InputStream in, INodeDirectory dir,
+        int size, final List<INodeReference> refList) throws IOException {
+      if (!dir.isWithSnapshot()) {
+        dir.addSnapshotFeature(null);
+      }
+      DirectoryDiffList diffs = dir.getDiffs();
+      for (int i = 0; i < size; i++) {
+        // load a directory diff
+        SnapshotDiffSection.DirectoryDiff diffInPb = SnapshotDiffSection.
+            DirectoryDiff.parseDelimitedFrom(in);
+        final int snapshotId = diffInPb.getSnapshotId();
+        final Snapshot snapshot = snapshotMap.get(snapshotId);
+        int childrenSize = diffInPb.getChildrenSize();
+        boolean useRoot = diffInPb.getIsSnapshotRoot();
+        INodeDirectoryAttributes copy = null;
+        if (useRoot) {
+          copy = snapshot.getRoot();
+        } else if (diffInPb.hasSnapshotCopy()) {
+          INodeSection.INodeDirectory dirCopyInPb = diffInPb.getSnapshotCopy();
+          final byte[] name = diffInPb.getName().toByteArray();
+          PermissionStatus permission = loadPermission(
+              dirCopyInPb.getPermission(), parent.getLoaderContext()
+                  .getStringTable());
+          long modTime = dirCopyInPb.getModificationTime();
+          boolean noQuota = dirCopyInPb.getNsQuota() == -1
+              && dirCopyInPb.getDsQuota() == -1;
+          copy = noQuota ? new INodeDirectoryAttributes.SnapshotCopy(name,
+              permission, modTime)
+              : new INodeDirectoryAttributes.CopyWithQuota(name, permission,
+                  modTime, dirCopyInPb.getNsQuota(), dirCopyInPb.getDsQuota());
+        }
+        // load created list
+        List<INode> clist = loadCreatedList(in, dir,
+            diffInPb.getCreatedListSize());
+        // load deleted list
+        List<INode> dlist = loadDeletedList(refList, in, dir,
+            diffInPb.getDeletedINodeList(), diffInPb.getDeletedINodeRefList());
+        // create the directory diff
+        DirectoryDiff diff = new DirectoryDiff(snapshotId, copy, null,
+            childrenSize, clist, dlist, useRoot);
+        diffs.addFirst(diff);
+      }
+    }
+  }
+
+  /**
+   * Saving snapshot related information to protobuf based FSImage
+   */
+  public final static class Saver {
+    private final FSNamesystem fsn;
+    private final FileSummary.Builder headers;
+    private final FSImageFormatProtobuf.Saver parent;
+    private final SaveNamespaceContext context;
+
+    public Saver(FSImageFormatProtobuf.Saver parent,
+        FileSummary.Builder headers, SaveNamespaceContext context,
+        FSNamesystem fsn) {
+      this.parent = parent;
+      this.headers = headers;
+      this.context = context;
+      this.fsn = fsn;
+    }
+
+    /**
+     * Save all the snapshottable directories and snapshots to the fsimage.
+     */
+    public void serializeSnapshotSection(OutputStream out) throws IOException {
+      SnapshotManager sm = fsn.getSnapshotManager();
+      SnapshotSection.Builder b = SnapshotSection.newBuilder()
+          .setSnapshotCounter(sm.getSnapshotCounter())
+          .setNumSnapshots(sm.getNumSnapshots());
+
+      INodeDirectorySnapshottable[] snapshottables = sm.getSnapshottableDirs();
+      for (INodeDirectorySnapshottable sdir : snapshottables) {
+        b.addSnapshottableDir(sdir.getId());
+      }
+      b.build().writeDelimitedTo(out);
+      int i = 0;
+      for(INodeDirectorySnapshottable sdir : snapshottables) {
+        for(Snapshot s : sdir.getSnapshotsByNames()) {
+          Root sroot = s.getRoot();
+          SnapshotSection.Snapshot.Builder sb = SnapshotSection.Snapshot
+              .newBuilder().setSnapshotId(s.getId());
+          INodeSection.INodeDirectory.Builder db = buildINodeDirectory(sroot,
+              parent.getSaverContext().getStringMap());
+          INodeSection.INode r = INodeSection.INode.newBuilder()
+              .setId(sroot.getId())
+              .setType(INodeSection.INode.Type.DIRECTORY)
+              .setName(ByteString.copyFrom(sroot.getLocalNameBytes()))
+              .setDirectory(db).build();
+          sb.setRoot(r).build().writeDelimitedTo(out);
+          i++;
+          if (i % FSImageFormatProtobuf.Saver.CHECK_CANCEL_INTERVAL == 0) {
+            context.checkCancelled();
+          }
+        }
+      }
+      Preconditions.checkState(i == sm.getNumSnapshots());
+      parent.commitSection(headers, FSImageFormatProtobuf.SectionName.SNAPSHOT);
+    }
+
+    /**
+     * This can only be called after serializing both INode_Dir and SnapshotDiff
+     */
+    public void serializeINodeReferenceSection(OutputStream out)
+        throws IOException {
+      final List<INodeReference> refList = parent.getSaverContext()
+          .getRefList();
+      for (INodeReference ref : refList) {
+        INodeReferenceSection.INodeReference.Builder rb = buildINodeReference(ref);
+        rb.build().writeDelimitedTo(out);
+      }
+      parent.commitSection(headers, SectionName.INODE_REFRENCE);
+    }
+
+    private INodeReferenceSection.INodeReference.Builder buildINodeReference(
+        INodeReference ref) throws IOException {
+      INodeReferenceSection.INodeReference.Builder rb =
+          INodeReferenceSection.INodeReference.newBuilder().
+            setReferredId(ref.getId());
+      if (ref instanceof WithName) {
+        rb.setLastSnapshotId(((WithName) ref).getLastSnapshotId()).setName(
+            ByteString.copyFrom(ref.getLocalNameBytes()));
+      } else if (ref instanceof DstReference) {
+        rb.setDstSnapshotId(((DstReference) ref).getDstSnapshotId());
+      }
+      return rb;
+    }
+
+    /**
+     * Save all the snapshot diffs to the fsimage.
+     */
+    public void serializeSnapshotDiffSection(OutputStream out)
+        throws IOException {
+      INodeMap inodesMap = fsn.getFSDirectory().getINodeMap();
+      final List<INodeReference> refList = parent.getSaverContext()
+          .getRefList();
+      int i = 0;
+      Iterator<INodeWithAdditionalFields> iter = inodesMap.getMapIterator();
+      while (iter.hasNext()) {
+        INodeWithAdditionalFields inode = iter.next();
+        if (inode.isFile()) {
+          serializeFileDiffList(inode.asFile(), out);
+        } else if (inode.isDirectory()) {
+          serializeDirDiffList(inode.asDirectory(), refList, out);
+        }
+        ++i;
+        if (i % FSImageFormatProtobuf.Saver.CHECK_CANCEL_INTERVAL == 0) {
+          context.checkCancelled();
+        }
+      }
+      parent.commitSection(headers,
+          FSImageFormatProtobuf.SectionName.SNAPSHOT_DIFF);
+    }
+
+    private void serializeFileDiffList(INodeFile file, OutputStream out)
+        throws IOException {
+      FileWithSnapshotFeature sf = file.getFileWithSnapshotFeature();
+      if (sf != null) {
+        List<FileDiff> diffList = sf.getDiffs().asList();
+        SnapshotDiffSection.DiffEntry entry = SnapshotDiffSection.DiffEntry
+            .newBuilder().setInodeId(file.getId()).setType(Type.FILEDIFF)
+            .setNumOfDiff(diffList.size()).build();
+        entry.writeDelimitedTo(out);
+        for (int i = diffList.size() - 1; i >= 0; i--) {
+          FileDiff diff = diffList.get(i);
+          SnapshotDiffSection.FileDiff.Builder fb = SnapshotDiffSection.FileDiff
+              .newBuilder().setSnapshotId(diff.getSnapshotId())
+              .setFileSize(diff.getFileSize());
+          INodeFileAttributes copy = diff.snapshotINode;
+          if (copy != null) {
+            fb.setName(ByteString.copyFrom(copy.getLocalNameBytes()))
+                .setSnapshotCopy(buildINodeFile(copy, parent.getSaverContext().getStringMap()));
+          }
+          fb.build().writeDelimitedTo(out);
+        }
+      }
+    }
+
+    private void saveCreatedList(List<INode> created, OutputStream out)
+        throws IOException {
+      // local names of the created list members
+      for (INode c : created) {
+        SnapshotDiffSection.CreatedListEntry.newBuilder()
+            .setName(ByteString.copyFrom(c.getLocalNameBytes())).build()
+            .writeDelimitedTo(out);
+      }
+    }
+
+    private void serializeDirDiffList(INodeDirectory dir,
+        final List<INodeReference> refList, OutputStream out)
+        throws IOException {
+      DirectoryWithSnapshotFeature sf = dir.getDirectoryWithSnapshotFeature();
+      if (sf != null) {
+        List<DirectoryDiff> diffList = sf.getDiffs().asList();
+        SnapshotDiffSection.DiffEntry entry = SnapshotDiffSection.DiffEntry
+            .newBuilder().setInodeId(dir.getId()).setType(Type.DIRECTORYDIFF)
+            .setNumOfDiff(diffList.size()).build();
+        entry.writeDelimitedTo(out);
+        for (int i = diffList.size() - 1; i >= 0; i--) { // reverse order!
+          DirectoryDiff diff = diffList.get(i);
+          SnapshotDiffSection.DirectoryDiff.Builder db = SnapshotDiffSection.
+              DirectoryDiff.newBuilder().setSnapshotId(diff.getSnapshotId())
+                           .setChildrenSize(diff.getChildrenSize())
+                           .setIsSnapshotRoot(diff.isSnapshotRoot());
+          INodeDirectoryAttributes copy = diff.snapshotINode;
+          if (!diff.isSnapshotRoot() && copy != null) {
+            db.setName(ByteString.copyFrom(copy.getLocalNameBytes()))
+                .setSnapshotCopy(
+                    buildINodeDirectory(copy, parent.getSaverContext().getStringMap()));
+          }
+          // process created list and deleted list
+          List<INode> created = diff.getChildrenDiff()
+              .getList(ListType.CREATED);
+          db.setCreatedListSize(created.size());
+          List<INode> deleted = diff.getChildrenDiff().getList(ListType.DELETED);
+          for (INode d : deleted) {
+            if (d.isReference()) {
+              refList.add(d.asReference());
+              db.addDeletedINodeRef(refList.size() - 1);
+            } else {
+              db.addDeletedINode(d.getId());
+            }
+          }
+          db.build().writeDelimitedTo(out);
+          saveCreatedList(created, out);
+        }
+      }
+    }
+  }
+
+  private FSImageFormatPBSnapshot(){}
+}
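
serializeFileDiffList and serializeDirDiffList above write each diff list from the last index down to index 0, and the loaders rebuild the lists with addFirst, so the original in-memory order is reproduced without sorting on load. A minimal sketch of that ordering invariant, with plain strings standing in for the diff records:

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

public class DiffOrderSketch {
  public static void main(String[] args) {
    List<String> diffs = Arrays.asList("diff-0", "diff-1", "diff-2"); // oldest .. newest

    // "Save": emit in reverse index order, as the savers above do.
    List<String> onDisk = new ArrayList<String>();
    for (int i = diffs.size() - 1; i >= 0; i--) {
      onDisk.add(diffs.get(i));
    }

    // "Load": read sequentially and addFirst, as loadFileDiffList/loadDirectoryDiffList do.
    Deque<String> reloaded = new ArrayDeque<String>();
    for (String d : onDisk) {
      reloaded.addFirst(d);
    }

    System.out.println(diffs);                            // [diff-0, diff-1, diff-2]
    System.out.println(new ArrayList<String>(reloaded));  // [diff-0, diff-1, diff-2]
  }
}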

+ 15 - 11
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotFSImageFormat.java

@@ -27,7 +27,6 @@ import java.util.Map;
 
 import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.server.namenode.FSImageFormat;
-import org.apache.hadoop.hdfs.server.namenode.FSImageFormat.Loader;
 import org.apache.hadoop.hdfs.server.namenode.FSImageSerialization;
 import org.apache.hadoop.hdfs.server.namenode.INode;
 import org.apache.hadoop.hdfs.server.namenode.INodeAttributes;
@@ -137,7 +136,7 @@ public class SnapshotFSImageFormat {
    * @param parent The directory that the created list belongs to.
    * @return The created node.
    */
-  private static INode loadCreated(byte[] createdNodeName,
+  public static INode loadCreated(byte[] createdNodeName,
       INodeDirectory parent) throws IOException {
     // the INode in the created list should be a reference to another INode
     // in posterior SnapshotDiffs or one of the current children
@@ -209,11 +208,13 @@ public class SnapshotFSImageFormat {
   
   /**
    * Load snapshots and snapshotQuota for a Snapshottable directory.
-   * @param snapshottableParent The snapshottable directory for loading.
-   * @param numSnapshots The number of snapshots that the directory has.
-   * @param in The {@link DataInput} instance to read.
-   * @param loader The {@link Loader} instance that this loading procedure is 
-   *               using.
+   *
+   * @param snapshottableParent
+   *          The snapshottable directory for loading.
+   * @param numSnapshots
+   *          The number of snapshots that the directory has.
+   * @param loader
+   *          The loader
    */
   public static void loadSnapshotList(
       INodeDirectorySnapshottable snapshottableParent, int numSnapshots,
@@ -231,10 +232,13 @@ public class SnapshotFSImageFormat {
   /**
    * Load the {@link SnapshotDiff} list for the INodeDirectoryWithSnapshot
    * directory.
-   * @param dir The snapshottable directory for loading.
-   * @param in The {@link DataInput} instance to read.
-   * @param loader The {@link Loader} instance that this loading procedure is 
-   *               using.
+   *
+   * @param dir
+   *          The snapshottable directory for loading.
+   * @param in
+   *          The {@link DataInput} instance to read.
+   * @param loader
+   *          The loader
    */
   public static void loadDirectoryDiffList(INodeDirectory dir,
       DataInput in, FSImageFormat.Loader loader) throws IOException {

+ 17 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotManager.java

@@ -270,6 +270,23 @@ public class SnapshotManager implements SnapshotStats {
     return numSnapshots.get();
   }
   
+  void setNumSnapshots(int num) {
+    numSnapshots.set(num);
+  }
+
+  int getSnapshotCounter() {
+    return snapshotCounter;
+  }
+
+  void setSnapshotCounter(int counter) {
+    snapshotCounter = counter;
+  }
+
+  INodeDirectorySnapshottable[] getSnapshottableDirs() {
+    return snapshottables.values().toArray(
+        new INodeDirectorySnapshottable[snapshottables.size()]);
+  }
+
   /**
    * Write {@link #snapshotCounter}, {@link #numSnapshots},
    * and all snapshots to the DataOutput.

+ 19 - 5
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/web/resources/NamenodeWebHdfsMethods.java

@@ -107,6 +107,7 @@ import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.security.token.TokenIdentifier;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Charsets;
 import com.sun.jersey.spi.container.ResourceFilters;
 
@@ -160,9 +161,10 @@ public class NamenodeWebHdfsMethods {
     response.setContentType(null);
   }
 
+  @VisibleForTesting
   static DatanodeInfo chooseDatanode(final NameNode namenode,
       final String path, final HttpOpParam.Op op, final long openOffset,
-      final long blocksize, final Configuration conf) throws IOException {
+      final long blocksize) throws IOException {
     final BlockManager bm = namenode.getNamesystem().getBlockManager();
 
     if (op == PutOpParam.Op.CREATE) {
@@ -201,7 +203,7 @@ public class NamenodeWebHdfsMethods {
         final LocatedBlocks locations = np.getBlockLocations(path, offset, 1);
         final int count = locations.locatedBlockCount();
         if (count > 0) {
-          return JspHelper.bestNode(locations.get(0).getLocations(), false, conf);
+          return bestNode(locations.get(0).getLocations());
         }
       }
     } 
@@ -210,13 +212,26 @@ public class NamenodeWebHdfsMethods {
         ).chooseRandom(NodeBase.ROOT);
   }
 
+  /**
+   * Choose the datanode to redirect the request. Note that the nodes have been
+   * sorted based on availability and network distances, so it is sufficient
+   * to return the first element of the nodes array here.
+   */
+  private static DatanodeInfo bestNode(DatanodeInfo[] nodes) throws IOException {
+    if (nodes.length == 0 || nodes[0].isDecommissioned()) {
+      throw new IOException("No active nodes contain this block");
+    }
+    return nodes[0];
+  }
+
   private Token<? extends TokenIdentifier> generateDelegationToken(
       final NameNode namenode, final UserGroupInformation ugi,
       final String renewer) throws IOException {
     final Credentials c = DelegationTokenSecretManager.createCredentials(
         namenode, ugi, renewer != null? renewer: ugi.getShortUserName());
     final Token<? extends TokenIdentifier> t = c.getAllTokens().iterator().next();
-    Text kind = request.getScheme().equals("http") ? WebHdfsFileSystem.TOKEN_KIND : SWebHdfsFileSystem.TOKEN_KIND;
+    Text kind = request.getScheme().equals("http") ? WebHdfsFileSystem.TOKEN_KIND
+        : SWebHdfsFileSystem.TOKEN_KIND;
     t.setKind(kind);
     return t;
   }
@@ -227,9 +242,8 @@ public class NamenodeWebHdfsMethods {
       final String path, final HttpOpParam.Op op, final long openOffset,
       final long blocksize,
       final Param<?, ?>... parameters) throws URISyntaxException, IOException {
-    final Configuration conf = (Configuration)context.getAttribute(JspHelper.CURRENT_CONF);
     final DatanodeInfo dn = chooseDatanode(namenode, path, op, openOffset,
-        blocksize, conf);
+        blocksize);
 
     final String delegationQuery;
     if (!UserGroupInformation.isSecurityEnabled()) {

+ 160 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/FileDistributionCalculator.java

@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools.offlineImageViewer;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
+import org.apache.hadoop.hdfs.server.namenode.FSImageUtil;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
+import org.apache.hadoop.io.IOUtils;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.LimitInputStream;
+
+/**
+ * This is the tool for analyzing file sizes in the namespace image. In order to
+ * run the tool one should define a range of integers <tt>[0, maxSize]</tt> by
+ * specifying <tt>maxSize</tt> and a <tt>step</tt>. The range of integers is
+ * divided into segments of size <tt>step</tt>:
+ * <tt>[0, s<sub>1</sub>, ..., s<sub>n-1</sub>, maxSize]</tt>, and the visitor
+ * calculates how many files in the system fall into each segment
+ * <tt>[s<sub>i-1</sub>, s<sub>i</sub>)</tt>. Note that files larger than
+ * <tt>maxSize</tt> always fall into the very last segment.
+ *
+ * <h3>Input.</h3>
+ * <ul>
+ * <li><tt>filename</tt> specifies the location of the image file;</li>
+ * <li><tt>maxSize</tt> determines the range <tt>[0, maxSize]</tt> of file
+ * sizes considered by the visitor;</li>
+ * <li><tt>step</tt> the range is divided into segments of size step.</li>
+ * </ul>
+ *
+ * <h3>Output.</h3> The output file is formatted as a tab-separated two-column
+ * table: Size and NumFiles, where Size represents the start of the segment and
+ * NumFiles is the number of files from the image whose size falls into this
+ * segment.
+ *
+ */
+final class FileDistributionCalculator {
+  private final static long MAX_SIZE_DEFAULT = 0x2000000000L; // 1/8 TB = 2^37
+  private final static int INTERVAL_DEFAULT = 0x200000; // 2 MB = 2^21
+
+  private final Configuration conf;
+  private final long maxSize;
+  private final int steps;
+  private final PrintWriter out;
+
+  private int[] distribution;
+  private int totalFiles;
+  private int totalDirectories;
+  private int totalBlocks;
+  private long totalSpace;
+  private long maxFileSize;
+
+  FileDistributionCalculator(Configuration conf, long maxSize, int steps,
+      PrintWriter out) {
+    this.conf = conf;
+    this.maxSize = maxSize == 0 ? MAX_SIZE_DEFAULT : maxSize;
+    this.steps = steps == 0 ? INTERVAL_DEFAULT : steps;
+    this.out = out;
+    long numIntervals = this.maxSize / this.steps;
+    this.distribution = new int[1 + (int) (numIntervals)];
+    Preconditions.checkState(numIntervals < Integer.MAX_VALUE,
+        "Too many distribution intervals");
+  }
+
+  void visit(RandomAccessFile file) throws IOException {
+    if (!FSImageUtil.checkFileFormat(file)) {
+      throw new IOException("Unrecognized FSImage");
+    }
+
+    FileSummary summary = FSImageUtil.loadSummary(file);
+    FileInputStream in = null;
+    try {
+      in = new FileInputStream(file.getFD());
+      for (FileSummary.Section s : summary.getSectionsList()) {
+        if (SectionName.fromString(s.getName()) != SectionName.INODE) {
+          continue;
+        }
+
+        in.getChannel().position(s.getOffset());
+        InputStream is = FSImageUtil.wrapInputStreamForCompression(conf,
+            summary.getCodec(), new BufferedInputStream(new LimitInputStream(
+                in, s.getLength())));
+        run(is);
+        output();
+      }
+    } finally {
+      IOUtils.cleanup(null, in);
+    }
+  }
+
+  private void run(InputStream in) throws IOException {
+    INodeSection s = INodeSection.parseDelimitedFrom(in);
+    for (int i = 0; i < s.getNumInodes(); ++i) {
+      INodeSection.INode p = INodeSection.INode.parseDelimitedFrom(in);
+      if (p.getType() == INodeSection.INode.Type.FILE) {
+        ++totalFiles;
+        INodeSection.INodeFile f = p.getFile();
+        totalBlocks += f.getBlocksCount();
+        long fileSize = 0;
+        for (BlockProto b : f.getBlocksList()) {
+          fileSize += b.getNumBytes() * f.getReplication();
+        }
+        maxFileSize = Math.max(fileSize, maxFileSize);
+        totalSpace += fileSize;
+
+        int bucket = fileSize > maxSize ? distribution.length - 1 : (int) Math
+            .ceil((double)fileSize / steps);
+        ++distribution[bucket];
+
+      } else if (p.getType() == INodeSection.INode.Type.DIRECTORY) {
+        ++totalDirectories;
+      }
+
+      if (i % (1 << 20) == 0) {
+        out.println("Processed " + i + " inodes.");
+      }
+    }
+  }
+
+  private void output() {
+    // write the distribution into the output file
+    out.print("Size\tNumFiles\n");
+    for (int i = 0; i < distribution.length; i++) {
+      if (distribution[i] != 0) {
+        out.print(((long) i * steps) + "\t" + distribution[i]);
+        out.print('\n');
+      }
+    }
+    out.print("totalFiles = " + totalFiles + "\n");
+    out.print("totalDirectories = " + totalDirectories + "\n");
+    out.print("totalBlocks = " + totalBlocks + "\n");
+    out.print("totalSpace = " + totalSpace + "\n");
+    out.print("maxFileSize = " + maxFileSize + "\n");
+  }
+}
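The segmentation that the class javadoc above describes reduces to a small piece of arithmetic: a size in [0, maxSize] lands in bucket ceil(size / step), and anything larger lands in the final catch-all bucket. Below is a minimal, standalone sketch of that mapping using the defaults of 2^37 bytes and 2 MB; the class name and sample sizes are purely illustrative and are not part of the patch.

```java
/** Standalone sketch of the bucketing used by FileDistributionCalculator. */
public class FileSizeBucketingSketch {
  static final long MAX_SIZE = 0x2000000000L; // 2^37 bytes, the default maxSize
  static final int STEP = 0x200000;           // 2^21 bytes (2 MB), the default step

  /** Maps a file size to its segment index, mirroring the logic in run(). */
  static int bucket(long fileSize) {
    int lastBucket = (int) (MAX_SIZE / STEP); // index of the catch-all segment
    return fileSize > MAX_SIZE
        ? lastBucket
        : (int) Math.ceil((double) fileSize / STEP);
  }

  public static void main(String[] args) {
    long[] sizes = { 0L, 1L, 2L * 1024 * 1024, 3L * 1024 * 1024, MAX_SIZE + 1 };
    for (long size : sizes) {
      System.out.println(size + " bytes -> bucket " + bucket(size));
    }
    // Prints buckets 0, 1, 1, 2 and 65536 (the last segment) respectively.
  }
}
```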

+ 250 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/LsrPBImage.java

@@ -0,0 +1,250 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools.offlineImageViewer;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
+import org.apache.hadoop.hdfs.server.namenode.FSImageUtil;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeDirectorySection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeReferenceSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INode;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeFile;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeSymlink;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.StringTableSection;
+import org.apache.hadoop.hdfs.server.namenode.INodeId;
+import org.apache.hadoop.io.IOUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.io.LimitInputStream;
+
+/**
+ * LsrPBImage prints an lsr-style listing of the files and directories in a
+ * protobuf-based fsimage: one line per inode with its permission string,
+ * replication, owner, group, modification time, size and full path.
+ *
+ * The tool first loads the string table, inode, inode reference and inode
+ * directory sections of the image into memory, and then walks the directory
+ * tree from the root inode to produce the listing.
+ */
+final class LsrPBImage {
+  private final Configuration conf;
+  private final PrintWriter out;
+  private String[] stringTable;
+  private HashMap<Long, INodeSection.INode> inodes = Maps.newHashMap();
+  private HashMap<Long, long[]> dirmap = Maps.newHashMap();
+  private ArrayList<INodeReferenceSection.INodeReference> refList = Lists.newArrayList();
+
+  public LsrPBImage(Configuration conf, PrintWriter out) {
+    this.conf = conf;
+    this.out = out;
+  }
+
+  public void visit(RandomAccessFile file) throws IOException {
+    if (!FSImageUtil.checkFileFormat(file)) {
+      throw new IOException("Unrecognized FSImage");
+    }
+
+    FileSummary summary = FSImageUtil.loadSummary(file);
+    FileInputStream fin = null;
+    try {
+      fin = new FileInputStream(file.getFD());
+
+      ArrayList<FileSummary.Section> sections = Lists.newArrayList(summary
+          .getSectionsList());
+      Collections.sort(sections, new Comparator<FileSummary.Section>() {
+        @Override
+        public int compare(FileSummary.Section s1, FileSummary.Section s2) {
+          SectionName n1 = SectionName.fromString(s1.getName());
+          SectionName n2 = SectionName.fromString(s2.getName());
+          if (n1 == null) {
+            return n2 == null ? 0 : -1;
+          } else if (n2 == null) {
+            return 1; // unknown sections sort first, mirroring the n1 == null case
+          } else {
+            return n1.ordinal() - n2.ordinal();
+          }
+        }
+      });
+
+      for (FileSummary.Section s : sections) {
+        fin.getChannel().position(s.getOffset());
+        InputStream is = FSImageUtil.wrapInputStreamForCompression(conf,
+            summary.getCodec(), new BufferedInputStream(new LimitInputStream(
+                fin, s.getLength())));
+
+        switch (SectionName.fromString(s.getName())) {
+        case STRING_TABLE:
+          loadStringTable(is);
+          break;
+        case INODE:
+          loadINodeSection(is);
+          break;
+        case INODE_REFRENCE:
+          loadINodeReferenceSection(is);
+          break;
+        case INODE_DIR:
+          loadINodeDirectorySection(is);
+          break;
+        default:
+          break;
+        }
+      }
+      list("", INodeId.ROOT_INODE_ID);
+    } finally {
+      IOUtils.cleanup(null, fin);
+    }
+  }
+
+  private void list(String parent, long dirId) {
+    INode inode = inodes.get(dirId);
+    listINode(parent.isEmpty() ? "/" : parent, inode);
+    long[] children = dirmap.get(dirId);
+    if (children == null) {
+      return;
+    }
+    String newParent = parent + inode.getName().toStringUtf8() + "/";
+    for (long cid : children) {
+      list(newParent, cid);
+    }
+  }
+
+  private void listINode(String parent, INode inode) {
+    switch (inode.getType()) {
+    case FILE: {
+      INodeFile f = inode.getFile();
+      PermissionStatus p = FSImageFormatPBINode.Loader.loadPermission(
+          f.getPermission(), stringTable);
+      out.print(String.format("-%s %2s %8s %10s %10s %10d %s%s\n", p
+          .getPermission().toString(), f.getReplication(), p.getUserName(), p
+          .getGroupName(), f.getModificationTime(), getFileSize(f), parent,
+          inode.getName().toStringUtf8()));
+    }
+      break;
+    case DIRECTORY: {
+      INodeDirectory d = inode.getDirectory();
+      PermissionStatus p = FSImageFormatPBINode.Loader.loadPermission(
+          d.getPermission(), stringTable);
+      out.print(String.format("d%s  - %8s %10s %10s %10d %s%s\n", p
+          .getPermission().toString(), p.getUserName(), p.getGroupName(), d
+          .getModificationTime(), 0, parent, inode.getName().toStringUtf8()));
+    }
+      break;
+    case SYMLINK: {
+      INodeSymlink d = inode.getSymlink();
+      PermissionStatus p = FSImageFormatPBINode.Loader.loadPermission(
+          d.getPermission(), stringTable);
+      out.print(String.format("-%s  - %8s %10s %10s %10d %s%s -> %s\n", p
+          .getPermission().toString(), p.getUserName(), p.getGroupName(), 0, 0,
+          parent, inode.getName().toStringUtf8(), d.getTarget().toStringUtf8()));
+    }
+      break;
+    default:
+      break;
+    }
+  }
+
+  private long getFileSize(INodeFile f) {
+    long size = 0;
+    for (BlockProto p : f.getBlocksList()) {
+      size += p.getNumBytes();
+    }
+    return size;
+  }
+
+  private void loadINodeDirectorySection(InputStream in) throws IOException {
+    while (true) {
+      INodeDirectorySection.DirEntry e = INodeDirectorySection.DirEntry
+          .parseDelimitedFrom(in);
+      // note that in is a LimitedInputStream
+      if (e == null) {
+        break;
+      }
+      long[] l = new long[e.getChildrenCount() + e.getRefChildrenCount()];
+      for (int i = 0; i < e.getChildrenCount(); ++i) {
+        l[i] = e.getChildren(i);
+      }
+      for (int i = e.getChildrenCount(); i < l.length; i++) {
+        int refId = e.getRefChildren(i - e.getChildrenCount());
+        l[i] = refList.get(refId).getReferredId();
+      }
+      dirmap.put(e.getParent(), l);
+    }
+  }
+
+  private void loadINodeReferenceSection(InputStream in) throws IOException {
+    while (true) {
+      INodeReferenceSection.INodeReference e = INodeReferenceSection
+          .INodeReference.parseDelimitedFrom(in);
+      if (e == null) {
+        break;
+      }
+      refList.add(e);
+    }
+  }
+
+  private void loadINodeSection(InputStream in) throws IOException {
+    INodeSection s = INodeSection.parseDelimitedFrom(in);
+    for (int i = 0; i < s.getNumInodes(); ++i) {
+      INodeSection.INode p = INodeSection.INode.parseDelimitedFrom(in);
+      inodes.put(p.getId(), p);
+    }
+  }
+
+  private void loadStringTable(InputStream in) throws IOException {
+    StringTableSection s = StringTableSection.parseDelimitedFrom(in);
+    stringTable = new String[s.getNumEntry() + 1];
+    for (int i = 0; i < s.getNumEntry(); ++i) {
+      StringTableSection.Entry e = StringTableSection.Entry
+          .parseDelimitedFrom(in);
+      stringTable[e.getId()] = e.getStr();
+    }
+  }
+}

+ 178 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/OfflineImageViewerPB.java

@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools.offlineImageViewer;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * OfflineImageViewerPB dumps the contents of a Hadoop fsimage file to XML or
+ * to the console. It is the main entry point into the utility, either via the
+ * command line or programmatically.
+ */
+@InterfaceAudience.Private
+public class OfflineImageViewerPB {
+  public static final Log LOG = LogFactory.getLog(OfflineImageViewerPB.class);
+
+  private final static String usage = "Usage: bin/hdfs oiv [OPTIONS] -i INPUTFILE -o OUTPUTFILE\n"
+      + "Offline Image Viewer\n"
+      + "View a Hadoop fsimage INPUTFILE using the specified PROCESSOR,\n"
+      + "saving the results in OUTPUTFILE.\n"
+      + "\n"
+      + "The oiv utility will attempt to parse correctly formed image files\n"
+      + "and will abort if given a malformed image file.\n"
+      + "\n"
+      + "The tool works offline and does not require a running cluster in\n"
+      + "order to process an image file.\n"
+      + "\n"
+      + "The following image processors are available:\n"
+      + "  * Ls: The default image processor generates an lsr-style listing\n"
+      + "    of the files in the namespace, with the same fields in the same\n"
+      + "    order.  Note that in order to correctly determine file sizes,\n"
+      + "    this formatter cannot skip blocks and will override the\n"
+      + "    -skipBlocks option.\n"
+      + "  * XML: This processor creates an XML document with all elements of\n"
+      + "    the fsimage enumerated, suitable for further analysis by XML\n"
+      + "    tools.\n"
+      + "  * FileDistribution: This processor analyzes the file size\n"
+      + "    distribution in the image.\n"
+      + "    -maxSize specifies the range [0, maxSize] of file sizes to be\n"
+      + "     analyzed (128GB by default).\n"
+      + "    -step defines the granularity of the distribution. (2MB by default)\n"
+      + "\n"
+      + "Required command line arguments:\n"
+      + "-i,--inputFile <arg>   FSImage file to process.\n"
+      + "-o,--outputFile <arg>  Name of output file. If the specified\n"
+      + "                       file exists, it will be overwritten.\n"
+      + "\n"
+      + "Optional command line arguments:\n"
+      + "-p,--processor <arg>   Select which type of processor to apply\n"
+      + "                       against image file."
+      + " (Ls|XML|FileDistribution).\n"
+      + "-h,--help              Display usage information and exit\n";
+
+  /**
+   * Build command-line options and descriptions
+   */
+  private static Options buildOptions() {
+    Options options = new Options();
+
+    // Build in/output file arguments, which are required, but there is no
+    // addOption method that can specify this
+    OptionBuilder.isRequired();
+    OptionBuilder.hasArgs();
+    OptionBuilder.withLongOpt("outputFile");
+    options.addOption(OptionBuilder.create("o"));
+
+    OptionBuilder.isRequired();
+    OptionBuilder.hasArgs();
+    OptionBuilder.withLongOpt("inputFile");
+    options.addOption(OptionBuilder.create("i"));
+
+    options.addOption("p", "processor", true, "");
+    options.addOption("h", "help", false, "");
+    options.addOption("skipBlocks", false, "");
+    options.addOption("printToScreen", false, "");
+    options.addOption("delimiter", true, "");
+    options.addOption("maxSize", true, "");
+    options.addOption("step", true, "");
+
+    return options;
+  }
+
+  /**
+   * Entry point to command-line-driven operation. User may specify options and
+   * start fsimage viewer from the command line. Program will process image file
+   * and exit cleanly or, if an error is encountered, inform user and exit.
+   * 
+   * @param args
+   *          Command line options
+   * @throws IOException
+   */
+  public static void main(String[] args) throws IOException {
+    Options options = buildOptions();
+    if (args.length == 0) {
+      printUsage();
+      return;
+    }
+
+    CommandLineParser parser = new PosixParser();
+    CommandLine cmd;
+
+    try {
+      cmd = parser.parse(options, args);
+    } catch (ParseException e) {
+      System.out.println("Error parsing command-line options: " + e.getMessage());
+      printUsage();
+      return;
+    }
+
+    if (cmd.hasOption("h")) { // print help and exit
+      printUsage();
+      return;
+    }
+
+    String inputFile = cmd.getOptionValue("i");
+    String processor = cmd.getOptionValue("p", "Ls");
+    String outputFile = cmd.getOptionValue("o");
+
+    PrintWriter out = (outputFile == null || outputFile.equals("-")) ? new PrintWriter(
+        System.out) : new PrintWriter(new File(outputFile));
+
+    Configuration conf = new Configuration();
+    try {
+      if (processor.equals("FileDistribution")) {
+        long maxSize = Long.parseLong(cmd.getOptionValue("maxSize", "0"));
+        int step = Integer.parseInt(cmd.getOptionValue("step", "0"));
+        new FileDistributionCalculator(conf, maxSize, step, out)
+            .visit(new RandomAccessFile(inputFile, "r"));
+      } else if (processor.equals("XML")) {
+        new PBImageXmlWriter(conf, out).visit(new RandomAccessFile(inputFile,
+            "r"));
+      } else {
+        new LsrPBImage(conf, out).visit(new RandomAccessFile(inputFile, "r"));
+      }
+    } catch (EOFException e) {
+      System.err.println("Input file ended unexpectedly. Exiting");
+    } catch (IOException e) {
+      System.err.println("Encountered exception.  Exiting: " + e.getMessage());
+    } finally {
+      out.close();
+    }
+
+  }
+
+  /**
+   * Print application usage instructions.
+   */
+  private static void printUsage() {
+    System.out.println(usage);
+  }
+}
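The usage string above covers the command-line path (bin/hdfs oiv). Since main() is public, the viewer can also be driven from Java; the sketch below is a hypothetical example of that, with placeholder paths and a made-up driver class name. Note that the class is annotated @InterfaceAudience.Private, so this is illustration rather than a supported API.

```java
import java.io.IOException;

import org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageViewerPB;

/** Hypothetical driver that runs the offline image viewer programmatically. */
public class OivExample {
  public static void main(String[] args) throws IOException {
    // Equivalent to: bin/hdfs oiv -p XML -i <fsimage> -o <output>
    OfflineImageViewerPB.main(new String[] {
        "-p", "XML",
        "-i", "/path/to/fsimage_0000000000000000000", // placeholder input image
        "-o", "/tmp/fsimage.xml"                      // placeholder output file
    });
  }
}
```

Passing "-o -" writes to standard output, and omitting -p falls back to the Ls processor, matching the defaults in main() above.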

+ 433 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/PBImageXmlWriter.java

@@ -0,0 +1,433 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools.offlineImageViewer;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveInfoExpirationProto;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveInfoProto;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolInfoProto;
+import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode;
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
+import org.apache.hadoop.hdfs.server.namenode.FSImageUtil;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.CacheManagerSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FilesUnderConstructionSection.FileUnderConstructionEntry;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeDirectorySection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeSymlink;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeReferenceSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.NameSystemSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotSection;
+import org.apache.hadoop.hdfs.server.namenode.FsImageProto.StringTableSection;
+import org.apache.hadoop.io.IOUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.io.LimitInputStream;
+
+/**
+ * PBImageXmlWriter walks a protobuf-based fsimage and writes an XML document
+ * that enumerates its sections (name system information, the string table,
+ * inodes, inode references, directories, files under construction, snapshots,
+ * snapshot diffs, the secret manager state and the cache manager state),
+ * suitable for further analysis by XML tools.
+ */
+@InterfaceAudience.Private
+public final class PBImageXmlWriter {
+  private final Configuration conf;
+  private final PrintWriter out;
+  private String[] stringTable;
+
+  public PBImageXmlWriter(Configuration conf, PrintWriter out) {
+    this.conf = conf;
+    this.out = out;
+  }
+
+  public void visit(RandomAccessFile file) throws IOException {
+    if (!FSImageUtil.checkFileFormat(file)) {
+      throw new IOException("Unrecognized FSImage");
+    }
+
+    FileSummary summary = FSImageUtil.loadSummary(file);
+    FileInputStream fin = null;
+    try {
+      fin = new FileInputStream(file.getFD());
+      out.print("<?xml version=\"1.0\"?>\n");
+
+      ArrayList<FileSummary.Section> sections = Lists.newArrayList(summary
+          .getSectionsList());
+      Collections.sort(sections, new Comparator<FileSummary.Section>() {
+        @Override
+        public int compare(FileSummary.Section s1, FileSummary.Section s2) {
+          SectionName n1 = SectionName.fromString(s1.getName());
+          SectionName n2 = SectionName.fromString(s2.getName());
+          if (n1 == null) {
+            return n2 == null ? 0 : -1;
+          } else if (n2 == null) {
+            return 1; // unknown sections sort first, mirroring the n1 == null case
+          } else {
+            return n1.ordinal() - n2.ordinal();
+          }
+        }
+      });
+
+      for (FileSummary.Section s : sections) {
+        fin.getChannel().position(s.getOffset());
+        InputStream is = FSImageUtil.wrapInputStreamForCompression(conf,
+            summary.getCodec(), new BufferedInputStream(new LimitInputStream(
+                fin, s.getLength())));
+
+        switch (SectionName.fromString(s.getName())) {
+        case NS_INFO:
+          dumpNameSection(is);
+          break;
+        case STRING_TABLE:
+          loadStringTable(is);
+          break;
+        case INODE:
+          dumpINodeSection(is);
+          break;
+        case INODE_REFRENCE:
+          dumpINodeReferenceSection(is);
+          break;
+        case INODE_DIR:
+          dumpINodeDirectorySection(is);
+          break;
+        case FILES_UNDERCONSTRUCTION:
+          dumpFileUnderConstructionSection(is);
+          break;
+        case SNAPSHOT:
+          dumpSnapshotSection(is);
+          break;
+        case SNAPSHOT_DIFF:
+          dumpSnapshotDiffSection(is);
+          break;
+        case SECRET_MANAGER:
+          dumpSecretManagerSection(is);
+          break;
+        case CACHE_MANAGER:
+          dumpCacheManagerSection(is);
+          break;
+        default:
+          break;
+        }
+      }
+    } finally {
+      IOUtils.cleanup(null, fin);
+    }
+  }
+
+  private void dumpCacheManagerSection(InputStream is) throws IOException {
+    out.print("<CacheManagerSection>");
+    CacheManagerSection s = CacheManagerSection.parseDelimitedFrom(is);
+    o("nextDirectiveId", s.getNextDirectiveId());
+    for (int i = 0; i < s.getNumPools(); ++i) {
+      CachePoolInfoProto p = CachePoolInfoProto.parseDelimitedFrom(is);
+      out.print("<pool>");
+      o("poolName", p.getPoolName()).o("ownerName", p.getOwnerName())
+          .o("groupName", p.getGroupName()).o("mode", p.getMode())
+          .o("limit", p.getLimit())
+          .o("maxRelativeExpiry", p.getMaxRelativeExpiry());
+      out.print("</pool>\n");
+    }
+    for (int i = 0; i < s.getNumPools(); ++i) {
+      CacheDirectiveInfoProto p = CacheDirectiveInfoProto
+          .parseDelimitedFrom(is);
+      out.print("<directive>");
+      o("id", p.getId()).o("path", p.getPath())
+          .o("replication", p.getReplication()).o("pool", p.getPool());
+      out.print("<expiration>");
+      CacheDirectiveInfoExpirationProto e = p.getExpiration();
+      o("millis", e.getMillis()).o("relative", e.getIsRelative());
+      out.print("</expiration>\n");
+      out.print("</directive>\n");
+    }
+    out.print("</CacheManagerSection>\n");
+
+  }
+
+  private void dumpFileUnderConstructionSection(InputStream in)
+      throws IOException {
+    out.print("<FileUnderConstructionSection>");
+    while (true) {
+      FileUnderConstructionEntry e = FileUnderConstructionEntry
+          .parseDelimitedFrom(in);
+      if (e == null) {
+        break;
+      }
+      out.print("<inode>");
+      o("id", e.getInodeId()).o("path", e.getFullPath());
+      out.print("</inode>\n");
+    }
+    out.print("</FileUnderConstructionSection>\n");
+  }
+
+  private void dumpINodeDirectory(INodeDirectory d) {
+    o("mtime", d.getModificationTime()).o("permission",
+        dumpPermission(d.getPermission()));
+
+    if (d.hasDsQuota() && d.hasNsQuota()) {
+      o("nsquota", d.getNsQuota()).o("dsquota", d.getDsQuota());
+    }
+  }
+
+  private void dumpINodeDirectorySection(InputStream in) throws IOException {
+    out.print("<INodeDirectorySection>");
+    while (true) {
+      INodeDirectorySection.DirEntry e = INodeDirectorySection.DirEntry
+          .parseDelimitedFrom(in);
+      // note that in is a LimitedInputStream
+      if (e == null) {
+        break;
+      }
+      out.print("<directory>");
+      o("parent", e.getParent());
+      for (long id : e.getChildrenList()) {
+        o("inode", id);
+      }
+      for (int refId : e.getRefChildrenList()) {
+        o("inodereference-index", refId);
+      }
+      out.print("</directory>\n");
+    }
+    out.print("</INodeDirectorySection>\n");
+  }
+
+  private void dumpINodeReferenceSection(InputStream in) throws IOException {
+    out.print("<INodeReferenceSection>");
+    while (true) {
+      INodeReferenceSection.INodeReference e = INodeReferenceSection
+          .INodeReference.parseDelimitedFrom(in);
+      if (e == null) {
+        break;
+      }
+      dumpINodeReference(e);
+    }
+    out.print("</INodeReferenceSection>\n");
+  }
+
+  private void dumpINodeReference(INodeReferenceSection.INodeReference r) {
+    out.print("<ref>");
+    o("referredId", r.getReferredId()).o("name", r.getName().toStringUtf8())
+        .o("dstSnapshotId", r.getDstSnapshotId())
+        .o("lastSnapshotId", r.getLastSnapshotId());
+    out.print("</ref>\n");
+  }
+
+  private void dumpINodeFile(INodeSection.INodeFile f) {
+    o("replication", f.getReplication()).o("mtime", f.getModificationTime())
+        .o("atime", f.getAccessTime())
+        .o("preferredBlockSize", f.getPreferredBlockSize())
+        .o("permission", dumpPermission(f.getPermission()));
+
+    if (f.getBlocksCount() > 0) {
+      out.print("<blocks>");
+      for (BlockProto b : f.getBlocksList()) {
+        out.print("<block>");
+        o("id", b.getBlockId()).o("genstamp", b.getGenStamp()).o("numBytes",
+            b.getNumBytes());
+        out.print("</block>\n");
+      }
+      out.print("</blocks>\n");
+    }
+
+    if (f.hasFileUC()) {
+      INodeSection.FileUnderConstructionFeature u = f.getFileUC();
+      out.print("<file-under-construction>");
+      o("clientName", u.getClientName()).o("clientMachine",
+          u.getClientMachine());
+      out.print("</file-under-construction>\n");
+    }
+  }
+
+  private void dumpINodeSection(InputStream in) throws IOException {
+    INodeSection s = INodeSection.parseDelimitedFrom(in);
+    out.print("<INodeSection>");
+    o("lastInodeId", s.getLastInodeId());
+    for (int i = 0; i < s.getNumInodes(); ++i) {
+      INodeSection.INode p = INodeSection.INode.parseDelimitedFrom(in);
+      out.print("<inode>");
+      o("id", p.getId()).o("type", p.getType()).o("name",
+          p.getName().toStringUtf8());
+
+      if (p.hasFile()) {
+        dumpINodeFile(p.getFile());
+      } else if (p.hasDirectory()) {
+        dumpINodeDirectory(p.getDirectory());
+      } else if (p.hasSymlink()) {
+        dumpINodeSymlink(p.getSymlink());
+      }
+
+      out.print("</inode>\n");
+    }
+    out.print("</INodeSection>\n");
+  }
+
+  private void dumpINodeSymlink(INodeSymlink s) {
+    o("permission", dumpPermission(s.getPermission())).o("target",
+        s.getTarget().toStringUtf8());
+  }
+
+  private void dumpNameSection(InputStream in) throws IOException {
+    NameSystemSection s = NameSystemSection.parseDelimitedFrom(in);
+    out.print("<NameSection>\n");
+    o("genstampV1", s.getGenstampV1()).o("genstampV2", s.getGenstampV2())
+        .o("genstampV1Limit", s.getGenstampV1Limit())
+        .o("lastAllocatedBlockId", s.getLastAllocatedBlockId())
+        .o("txid", s.getTransactionId());
+    out.print("</NameSection>\n");
+  }
+
+  private String dumpPermission(long permission) {
+    return FSImageFormatPBINode.Loader.loadPermission(permission, stringTable)
+        .toString();
+  }
+
+  private void dumpSecretManagerSection(InputStream is) throws IOException {
+    out.print("<SecretManagerSection>");
+    SecretManagerSection s = SecretManagerSection.parseDelimitedFrom(is);
+    o("currentId", s.getCurrentId()).o("tokenSequenceNumber",
+        s.getTokenSequenceNumber());
+    out.print("</SecretManagerSection>");
+  }
+
+  private void dumpSnapshotDiffSection(InputStream in) throws IOException {
+    out.print("<SnapshotDiffSection>");
+    while (true) {
+      SnapshotDiffSection.DiffEntry e = SnapshotDiffSection.DiffEntry
+          .parseDelimitedFrom(in);
+      if (e == null) {
+        break;
+      }
+      out.print("<diff>");
+      o("inodeid", e.getInodeId());
+      switch (e.getType()) {
+      case FILEDIFF: {
+        for (int i = 0; i < e.getNumOfDiff(); ++i) {
+          out.print("<filediff>");
+          SnapshotDiffSection.FileDiff f = SnapshotDiffSection.FileDiff
+              .parseDelimitedFrom(in);
+          o("snapshotId", f.getSnapshotId()).o("size", f.getFileSize()).o(
+              "name", f.getName().toStringUtf8());
+          out.print("</filediff>\n");
+        }
+      }
+        break;
+      case DIRECTORYDIFF: {
+        for (int i = 0; i < e.getNumOfDiff(); ++i) {
+          out.print("<dirdiff>");
+          SnapshotDiffSection.DirectoryDiff d = SnapshotDiffSection.DirectoryDiff
+              .parseDelimitedFrom(in);
+          o("snapshotId", d.getSnapshotId())
+              .o("isSnapshotroot", d.getIsSnapshotRoot())
+              .o("childrenSize", d.getChildrenSize())
+              .o("name", d.getName().toStringUtf8());
+
+          for (int j = 0; j < d.getCreatedListSize(); ++j) {
+            SnapshotDiffSection.CreatedListEntry ce = SnapshotDiffSection.CreatedListEntry
+                .parseDelimitedFrom(in);
+            out.print("<created>");
+            o("name", ce.getName().toStringUtf8());
+            out.print("</created>\n");
+          }
+          for (long did : d.getDeletedINodeList()) {
+            out.print("<deleted>");
+            o("inode", did);
+            out.print("</deleted>\n");
+          }
+          for (int dRefid : d.getDeletedINodeRefList()) {
+            out.print("<deleted>");
+            o("inodereference-index", dRefid);
+            out.print("</deleted>\n");
+          }
+          out.print("</dirdiff>\n");
+        }
+      }
+        break;
+      default:
+        break;
+      }
+      out.print("</diff>");
+    }
+    out.print("</SnapshotDiffSection>\n");
+  }
+
+  private void dumpSnapshotSection(InputStream in) throws IOException {
+    out.print("<SnapshotSection>");
+    SnapshotSection s = SnapshotSection.parseDelimitedFrom(in);
+    o("snapshotCounter", s.getSnapshotCounter());
+    if (s.getSnapshottableDirCount() > 0) {
+      out.print("<snapshottableDir>");
+      for (long id : s.getSnapshottableDirList()) {
+        o("dir", id);
+      }
+      out.print("</snapshottableDir>\n");
+    }
+    for (int i = 0; i < s.getNumSnapshots(); ++i) {
+      SnapshotSection.Snapshot pbs = SnapshotSection.Snapshot
+          .parseDelimitedFrom(in);
+      o("snapshot", pbs.getSnapshotId());
+    }
+    out.print("</SnapshotSection>\n");
+  }
+
+  private void loadStringTable(InputStream in) throws IOException {
+    StringTableSection s = StringTableSection.parseDelimitedFrom(in);
+    stringTable = new String[s.getNumEntry() + 1];
+    for (int i = 0; i < s.getNumEntry(); ++i) {
+      StringTableSection.Entry e = StringTableSection.Entry
+          .parseDelimitedFrom(in);
+      stringTable[e.getId()] = e.getStr();
+    }
+  }
+
+  private PBImageXmlWriter o(final String e, final Object v) {
+    out.print("<" + e + ">" + v + "</" + e + ">");
+    return this;
+  }
+}

+ 284 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/proto/fsimage.proto

@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+option java_package = "org.apache.hadoop.hdfs.server.namenode";
+option java_outer_classname = "FsImageProto";
+
+package hadoop.hdfs.fsimage;
+
+import "hdfs.proto";
+
+/**
+ * This file defines the on-disk layout of the file system image. The
+ * layout is defined by the following EBNF grammar, in which angle
+ * brackets mark protobuf definitions. (e.g., <FileSummary>)
+ *
+ * FILE := MAGIC SECTION* <FileSummary> FileSummaryLength
+ * MAGIC := 'HDFSIMG1'
+ * SECTION := <NameSystemSection> | ...
+ * FileSummaryLength := 4 byte int
+ *
+ * Some notes:
+ *
+ * The codec field in FileSummary describes the compression codec used
+ * for all sections. The file header is always uncompressed.
+ *
+ * All protobuf messages are serialized in delimited form, which means
+ * that each message is preceded by an integer indicating the size of
+ * the protobuf message.
+ *
+ */
+
+message FileSummary {
+  // The version of the above EBNF grammar.
+  required uint32 ondiskVersion = 1;
+  // layoutVersion describes which features are available in the
+  // FSImage.
+  required uint32 layoutVersion = 2;
+  optional string codec         = 3;
+  // index for each section
+  message Section {
+    optional string name = 1;
+    optional uint64 length = 2;
+    optional uint64 offset = 3;
+  }
+  repeated Section sections = 4;
+}
+
+/**
+ * Name: NS_INFO
+ */
+message NameSystemSection {
+  optional uint32 namespaceId = 1;
+  optional uint64 genstampV1 = 2;
+  optional uint64 genstampV2 = 3;
+  optional uint64 genstampV1Limit = 4;
+  optional uint64 lastAllocatedBlockId = 5;
+  optional uint64 transactionId = 6;
+}
+
+/**
+ * Permission is serialized as a 64-bit long. [0:24):[24:48):[48:64) (in Big Endian).
+ * The first and the second parts are the string ids of the user and
+ * group name, and the last 16 bits are the permission bits.
+ *
+ * Name: INODE
+ */
+message INodeSection {
+  /**
+   * under-construction feature for INodeFile
+   */
+  message FileUnderConstructionFeature {
+    optional string clientName = 1;
+    optional string clientMachine = 2;
+  }
+
+  message INodeFile {
+    optional uint32 replication = 1;
+    optional uint64 modificationTime = 2;
+    optional uint64 accessTime = 3;
+    optional uint64 preferredBlockSize = 4;
+    optional fixed64 permission = 5;
+    repeated BlockProto blocks = 6;
+    optional FileUnderConstructionFeature fileUC = 7;
+  }
+
+  message INodeDirectory {
+    optional uint64 modificationTime = 1;
+    // namespace quota
+    optional uint64 nsQuota = 2;
+    // diskspace quota
+    optional uint64 dsQuota = 3;
+    optional fixed64 permission = 4;
+  }
+
+  message INodeSymlink {
+    optional fixed64 permission = 1;
+    optional bytes target = 2;
+  }
+
+  message INode {
+    enum Type {
+      FILE = 1;
+      DIRECTORY = 2;
+      SYMLINK = 3;
+    };
+    required Type type = 1;
+    required uint64 id = 2;
+    optional bytes name = 3;
+
+    optional INodeFile file = 4;
+    optional INodeDirectory directory = 5;
+    optional INodeSymlink symlink = 6;
+  }
+
+  optional uint64 lastInodeId = 1;
+  optional uint64 numInodes = 2;
+  // repeated INodes..
+}
+
+/**
+ * This section records information about under-construction files for
+ * reconstructing the lease map.
+ * NAME: FILES_UNDERCONSTRUCTION
+ */
+message FilesUnderConstructionSection {
+  message FileUnderConstructionEntry {
+    optional uint64 inodeId = 1;
+    optional string fullPath = 2;
+  }
+  // repeated FileUnderConstructionEntry...
+}
+
+/**
+ * This section records the children of each directory
+ * NAME: INODE_DIR
+ */
+message INodeDirectorySection {
+  message DirEntry {
+    optional uint64 parent = 1;
+    // children that are not reference nodes
+    repeated uint64 children = 2 [packed = true];
+    // children that are reference nodes, each element is a reference node id
+    repeated uint32 refChildren = 3 [packed = true];
+  }
+  // repeated DirEntry, ended at the boundary of the section.
+}
+
+message INodeReferenceSection {
+  message INodeReference {
+    // id of the referred inode
+    optional uint64 referredId = 1;
+    // local name recorded in WithName
+    optional bytes name = 2;
+    // recorded in DstReference
+    optional uint32 dstSnapshotId = 3;
+    // recorded in WithName
+    optional uint32 lastSnapshotId = 4;
+  }
+  // repeated INodeReference...
+}
+
+/**
+ * This section records information about snapshots
+ * NAME: SNAPSHOT
+ */
+message SnapshotSection {
+  message Snapshot {
+    optional uint32 snapshotId = 1;
+    // Snapshot root
+    optional INodeSection.INode root = 2;
+  }
+
+  optional uint32 snapshotCounter = 1;
+  repeated uint64 snapshottableDir = 2 [packed = true];
+  // total number of snapshots
+  optional uint32 numSnapshots = 3;
+  // repeated Snapshot...
+}
+
+/**
+ * This section records information about snapshot diffs
+ * NAME: SNAPSHOT_DIFF
+ */
+message SnapshotDiffSection {
+  message CreatedListEntry {
+    optional bytes name = 1;
+  }
+
+  message DirectoryDiff {
+    optional uint32 snapshotId = 1;
+    optional uint32 childrenSize = 2;
+    optional bool isSnapshotRoot = 3;
+    optional bytes name = 4;
+    optional INodeSection.INodeDirectory snapshotCopy = 5;
+    optional uint32 createdListSize = 6;
+    repeated uint64 deletedINode = 7 [packed = true]; // id of deleted inodes
+    // id of reference nodes in the deleted list
+    repeated uint32 deletedINodeRef = 8 [packed = true];
+    // repeated CreatedListEntry (size is specified by createdListSize)
+  }
+
+  message FileDiff {
+    optional uint32 snapshotId = 1;
+    optional uint64 fileSize = 2;
+    optional bytes name = 3;
+    optional INodeSection.INodeFile snapshotCopy = 4;
+  }
+
+  message DiffEntry {
+    enum Type {
+      FILEDIFF = 1;
+      DIRECTORYDIFF = 2;
+    }
+    required Type type = 1;
+    optional uint64 inodeId = 2;
+    optional uint32 numOfDiff = 3;
+
+    // repeated DirectoryDiff or FileDiff
+  }
+
+  // repeated DiffEntry
+}
+
+/**
+ * This section maps strings to ids
+ * NAME: STRING_TABLE
+ */
+message StringTableSection {
+  message Entry {
+    optional uint32 id = 1;
+    optional string str = 2;
+  }
+  optional uint32 numEntry = 1;
+  // repeated Entry
+}
+
+message SecretManagerSection {
+  message DelegationKey {
+    optional uint32 id         = 1;
+    optional uint64 expiryDate = 2;
+    optional bytes  key        = 3;
+  }
+  message PersistToken {
+    optional uint32 version        = 1;
+    optional string owner          = 2;
+    optional string renewer        = 3;
+    optional string realUser       = 4;
+    optional uint64 issueDate      = 5;
+    optional uint64 maxDate        = 6;
+    optional uint32 sequenceNumber = 7;
+    optional uint32 masterKeyId    = 8;
+    optional uint64 expiryDate     = 9;
+  }
+  optional uint32 currentId = 1;
+  optional uint32 tokenSequenceNumber = 2;
+  optional uint32 numKeys = 3;
+  optional uint32 numTokens = 4;
+  // repeated DelegationKey keys
+  // repeated PersistToken tokens
+}
+
+message CacheManagerSection {
+  required uint64 nextDirectiveId = 1;
+  required uint32 numPools        = 2;
+  required uint32 numDirectives   = 3;
+  // repeated CachePoolInfoProto pools
+  // repeated CacheDirectiveInfoProto directives
+}
+
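The EBNF comment at the top of this file implies that a reader can locate the FileSummary by reading the trailing 4-byte length field and then parsing the delimited summary immediately before it. The sketch below shows that lookup in isolation; the class name is hypothetical, error handling and the MAGIC check are omitted, and it assumes (as FSImageUtil.loadSummary is expected to do) that the trailing length counts the delimited summary bytes.

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;

/** Sketch: read the FileSummary from the tail of an fsimage file. */
public class FileSummaryPeek {
  public static FileSummary readSummary(String imagePath) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(imagePath, "r")) {
      long fileLength = file.length();
      // The last 4 bytes hold the length of the serialized FileSummary.
      file.seek(fileLength - 4);
      int summaryLength = file.readInt();
      // The summary sits immediately before the length field and, like every
      // other message in the image, is written in delimited form.
      file.seek(fileLength - 4 - summaryLength);
      byte[] bytes = new byte[summaryLength];
      file.readFully(bytes);
      return FileSummary.parseDelimitedFrom(new ByteArrayInputStream(bytes));
    }
  }

  public static void main(String[] args) throws IOException {
    FileSummary summary = readSummary(args[0]);
    System.out.println("ondiskVersion = " + summary.getOndiskVersion());
    for (FileSummary.Section s : summary.getSectionsList()) {
      System.out.println(s.getName() + " offset=" + s.getOffset()
          + " length=" + s.getLength());
    }
  }
}
```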

+ 59 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

@@ -96,6 +96,14 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.namenode.secondary.https-address</name>
+  <value>0.0.0.0:50091</value>
+  <description>
+    The secondary namenode HTTPS server address and port.
+  </description>
+</property>
+
 <property>
   <name>dfs.datanode.address</name>
   <value>0.0.0.0:50010</value>
@@ -161,6 +169,16 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.client.cached.conn.retry</name>
+  <value>3</value>
+  <description>The number of times the HDFS client will pull a socket from the
+   cache.  Once this number is exceeded, the client will try to create a new
+   socket.
+  </description>
+</property>
+
+
 <property>
   <name>dfs.https.server.keystore.resource</name>
   <value>ssl-server.xml</value>
@@ -1300,7 +1318,16 @@
   <name>dfs.journalnode.http-address</name>
   <value>0.0.0.0:8480</value>
   <description>
-    The address and port the JournalNode web UI listens on.
+    The address and port the JournalNode HTTP server listens on.
+    If the port is 0 then the server will start on a free port.
+  </description>
+</property>
+
+<property>
+  <name>dfs.journalnode.https-address</name>
+  <value>0.0.0.0:8481</value>
+  <description>
+    The address and port the JournalNode HTTPS server listens on.
     If the port is 0 then the server will start on a free port.
   </description>
 </property>
@@ -1489,6 +1516,26 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.client.mmap.retry.timeout.ms</name>
+  <value>300000</value>
+  <description>
+    The minimum amount of time that we will wait before retrying a failed mmap
+    operation.
+  </description>
+</property>
+
+<property>
+  <name>dfs.client.short.circuit.replica.stale.threshold.ms</name>
+  <value>3000000</value>
+  <description>
+    The maximum amount of time that we will consider a short-circuit replica to
+    be valid, if there is no communication from the DataNode.  After this time
+    has elapsed, we will re-fetch the short-circuit replica even if it is in
+    the cache.
+  </description>
+</property>
+
 <property>
   <name>dfs.namenode.path.based.cache.block.map.allocation.percent</name>
   <value>0.25</value>
@@ -1618,4 +1665,15 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.client.context</name>
+  <value>default</value>
+  <description>
+    The name of the DFSClient context that we should use.  Clients that share
+    a context share a socket cache and short-circuit cache, among other things.
+    You should only change this if you don't want to share with another set of
+    threads.
+  </description>
+</property>
+
 </configuration>
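The new dfs.client.context key is mostly of interest to applications that embed several HDFS clients in one JVM: per the description above, clients that share a context name share a socket cache and short-circuit cache. A hedged sketch of overriding it for one client follows; the context name and NameNode URI are placeholders, and DFSConfigKeys.DFS_CLIENT_CONTEXT is the constant the new tests use for the same purpose.

```java
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

/** Sketch: give an embedded HDFS client its own DFSClient context. */
public class ClientContextExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new HdfsConfiguration();
    // Clients created with the same context name share a socket cache and
    // short-circuit cache; a distinct name isolates this client from others.
    conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, "myIsolatedContext");
    FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020/"), conf);
    try {
      System.out.println("Home directory: " + fs.getHomeDirectory());
    } finally {
      fs.close();
    }
  }
}
```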

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.html

@@ -1,3 +1,5 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
@@ -14,8 +16,6 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head>
 <link rel="stylesheet" type="text/css" href="/static/bootstrap-3.0.2/css/bootstrap.min.css" />

+ 25 - 10
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.js

@@ -28,7 +28,7 @@
       {"name": "nn",      "url": "/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"},
       {"name": "nnstat",  "url": "/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"},
       {"name": "fs",      "url": "/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState"},
-      {"name": "mem",     "url": "/jmx?qry=java.lang:type=Memory"},
+      {"name": "mem",     "url": "/jmx?qry=java.lang:type=Memory"}
     ];
 
     var HELPERS = {
@@ -166,14 +166,29 @@
 
   $('#ui-tabs a[href="#tab-snapshot"]').click(load_snapshot_info);
 
-  var hash = window.location.hash;
-  if (hash === "#tab-datanode") {
-    load_datanode_info();
-  } else if (hash === "#tab-snapshot") {
-    load_snapshot_info();
-  } else if (hash === "#tab-startup-progress") {
-    load_startup_progress();
-  } else {
-    load_overview();
+  function load_page() {
+    var hash = window.location.hash;
+    switch(hash) {
+      case "#tab-datanode":
+        load_datanode_info();
+        break;
+      case "#tab-snapshot":
+        load_snapshot_info();
+        break;
+      case "#tab-startup-progress":
+        load_startup_progress();
+        break;
+      case "#tab-overview":
+        load_overview();
+        break;
+      default:
+        window.location.hash = "tab-overview";
+        break;
+    }
   }
+  load_page();
+
+  $(window).bind('hashchange', function () {
+    load_page();
+  });
 })();

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/explorer.html

@@ -1,3 +1,5 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <!--
     Licensed to the Apache Software Foundation (ASF) under one or more
     contributor license agreements.  See the NOTICE file distributed with
@@ -14,8 +16,6 @@
     See the License for the specific language governing permissions and
     limitations under the License.
   -->
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
-	  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
   <head>
     <link rel="stylesheet" type="text/css" href="/static/bootstrap-3.0.2/css/bootstrap.min.css" />

+ 21 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/explorer.js

@@ -35,8 +35,8 @@
       }
 
       if (sticky) {
-	var exec = ((parms.perm % 10) & 1) == 1;
-	res[res.length - 1] = exec ? 't' : 'T';
+        var otherExec = ((ctx.current().permission % 10) & 1) == 1;
+        res = res.substr(0, res.length - 1) + (otherExec ? 't' : 'T');
       }
 
       chunk.write(dir + res);
@@ -52,6 +52,18 @@
     $('#alert-panel').show();
   }
 
+  $(window).bind('hashchange', function () {
+    $('#alert-panel').hide();
+
+    var dir = window.location.hash.slice(1);
+    if(dir == "") {
+      dir = "/";
+    }
+    if(current_directory != dir) {
+      browse_directory(dir);
+    }
+  });
+
   function network_error_handler(url) {
     return function (jqxhr, text, err) {
+      var msg = '<p>Failed to retrieve data from ' + url + ', cause: ' + err + '</p>';
@@ -145,6 +157,7 @@
 
       current_directory = dir;
       $('#directory').val(dir);
+      window.location.hash = dir;
       dust.render('explorer', base.push(d), function(err, out) {
         $('#panel').html(out);
 
@@ -169,7 +182,12 @@
 
     var b = function() { browse_directory($('#directory').val()); };
     $('#btn-nav-directory').click(b);
-    browse_directory('/');
+    var dir = window.location.hash.slice(1);
+    if(dir == "") {
+      window.location.hash = "/";
+    } else {
+      browse_directory(dir);
+    }
   }
 
   init();

+ 123 - 101
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestEnhancedByteBufferAccess.java

@@ -28,32 +28,39 @@ import java.util.EnumSet;
 import java.util.Random;
 
 import org.apache.commons.lang.SystemUtils;
+import org.apache.commons.lang.mutable.MutableBoolean;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.ExtendedBlockId;
+import org.apache.hadoop.hdfs.ClientContext;
+import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.hdfs.client.ClientMmap;
-import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache.CacheVisitor;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.io.ByteBufferPool;
-import org.apache.hadoop.io.ElasticByteBufferPool;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.nativeio.NativeIO;
 import org.apache.hadoop.net.unix.DomainSocket;
 import org.apache.hadoop.net.unix.TemporarySocketDirectory;
+import org.apache.hadoop.security.token.SecretManager.InvalidToken;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import java.util.Map;
+
 import com.google.common.base.Preconditions;
 import com.google.common.base.Supplier;
 
@@ -250,17 +257,39 @@ public class TestEnhancedByteBufferAccess {
     }
   }
 
-  private static class CountingVisitor
-      implements ClientMmapManager.ClientMmapVisitor {
-    int count = 0;
+  private static class CountingVisitor implements CacheVisitor {
+    private final int expectedNumOutstandingMmaps;
+    private final int expectedNumReplicas;
+    private final int expectedNumEvictable;
+    private final int expectedNumMmapedEvictable;
 
-    @Override
-    public void accept(ClientMmap mmap) {
-      count++;
+    CountingVisitor(int expectedNumOutstandingMmaps,
+        int expectedNumReplicas, int expectedNumEvictable,
+        int expectedNumMmapedEvictable) {
+      this.expectedNumOutstandingMmaps = expectedNumOutstandingMmaps;
+      this.expectedNumReplicas = expectedNumReplicas;
+      this.expectedNumEvictable = expectedNumEvictable;
+      this.expectedNumMmapedEvictable = expectedNumMmapedEvictable;
     }
 
-    public void reset() {
-      count = 0;
+    @Override
+    public void visit(int numOutstandingMmaps,
+        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
+        Map<ExtendedBlockId, InvalidToken> failedLoads,
+        Map<Long, ShortCircuitReplica> evictable,
+        Map<Long, ShortCircuitReplica> evictableMmapped) {
+      if (expectedNumOutstandingMmaps >= 0) {
+        Assert.assertEquals(expectedNumOutstandingMmaps, numOutstandingMmaps);
+      }
+      if (expectedNumReplicas >= 0) {
+        Assert.assertEquals(expectedNumReplicas, replicas.size());
+      }
+      if (expectedNumEvictable >= 0) {
+        Assert.assertEquals(expectedNumEvictable, evictable.size());
+      }
+      if (expectedNumMmapedEvictable >= 0) {
+        Assert.assertEquals(expectedNumMmapedEvictable, evictableMmapped.size());
+      }
     }
   }
 
@@ -271,105 +300,98 @@ public class TestEnhancedByteBufferAccess {
     final Path TEST_PATH = new Path("/a");
     final int TEST_FILE_LENGTH = 16385;
     final int RANDOM_SEED = 23453;
+    final String CONTEXT = "testZeroCopyMmapCacheContext";
     FSDataInputStream fsIn = null;
-    ByteBuffer results[] = { null, null, null, null, null };
-    
+    ByteBuffer results[] = { null, null, null, null };
+
     DistributedFileSystem fs = null;
+    conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, CONTEXT);
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
+    cluster.waitActive();
+    fs = cluster.getFileSystem();
+    DFSTestUtil.createFile(fs, TEST_PATH,
+        TEST_FILE_LENGTH, (short)1, RANDOM_SEED);
     try {
-      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
-      cluster.waitActive();
-      fs = cluster.getFileSystem();
-      DFSTestUtil.createFile(fs, TEST_PATH,
-          TEST_FILE_LENGTH, (short)1, RANDOM_SEED);
-      try {
-        DFSTestUtil.waitReplication(fs, TEST_PATH, (short)1);
-      } catch (InterruptedException e) {
-        Assert.fail("unexpected InterruptedException during " +
-            "waitReplication: " + e);
-      } catch (TimeoutException e) {
-        Assert.fail("unexpected TimeoutException during " +
-            "waitReplication: " + e);
-      }
-      fsIn = fs.open(TEST_PATH);
-      byte original[] = new byte[TEST_FILE_LENGTH];
-      IOUtils.readFully(fsIn, original, 0, TEST_FILE_LENGTH);
-      fsIn.close();
-      fsIn = fs.open(TEST_PATH);
-      final ClientMmapManager mmapManager = fs.getClient().getMmapManager();
-      final CountingVisitor countingVisitor = new CountingVisitor();
-      mmapManager.visitMmaps(countingVisitor);
-      Assert.assertEquals(0, countingVisitor.count);
-      mmapManager.visitEvictable(countingVisitor);
-      Assert.assertEquals(0, countingVisitor.count);
-      results[0] = fsIn.read(null, 4096,
-          EnumSet.of(ReadOption.SKIP_CHECKSUMS));
-      fsIn.seek(0);
-      results[1] = fsIn.read(null, 4096,
-          EnumSet.of(ReadOption.SKIP_CHECKSUMS));
-      mmapManager.visitMmaps(countingVisitor);
-      Assert.assertEquals(1, countingVisitor.count);
-      countingVisitor.reset();
-      mmapManager.visitEvictable(countingVisitor);
-      Assert.assertEquals(0, countingVisitor.count);
-      countingVisitor.reset();
-
-      // The mmaps should be of the first block of the file.
-      final ExtendedBlock firstBlock = DFSTestUtil.getFirstBlock(fs, TEST_PATH);
-      mmapManager.visitMmaps(new ClientMmapManager.ClientMmapVisitor() {
-        @Override
-        public void accept(ClientMmap mmap) {
-          Assert.assertEquals(firstBlock, mmap.getBlock());
-        }
-      });
+      DFSTestUtil.waitReplication(fs, TEST_PATH, (short)1);
+    } catch (InterruptedException e) {
+      Assert.fail("unexpected InterruptedException during " +
+          "waitReplication: " + e);
+    } catch (TimeoutException e) {
+      Assert.fail("unexpected TimeoutException during " +
+          "waitReplication: " + e);
+    }
+    fsIn = fs.open(TEST_PATH);
+    byte original[] = new byte[TEST_FILE_LENGTH];
+    IOUtils.readFully(fsIn, original, 0, TEST_FILE_LENGTH);
+    fsIn.close();
+    fsIn = fs.open(TEST_PATH);
+    final ShortCircuitCache cache = ClientContext.get(
+        CONTEXT, new DFSClient.Conf(conf)).getShortCircuitCache();
+    cache.accept(new CountingVisitor(0, 5, 5, 0));
+    results[0] = fsIn.read(null, 4096,
+        EnumSet.of(ReadOption.SKIP_CHECKSUMS));
+    fsIn.seek(0);
+    results[1] = fsIn.read(null, 4096,
+        EnumSet.of(ReadOption.SKIP_CHECKSUMS));
 
-      // Read more blocks.
-      results[2] = fsIn.read(null, 4096,
-          EnumSet.of(ReadOption.SKIP_CHECKSUMS));
-      results[3] = fsIn.read(null, 4096,
-          EnumSet.of(ReadOption.SKIP_CHECKSUMS));
-      try {
-        results[4] = fsIn.read(null, 4096,
-            EnumSet.of(ReadOption.SKIP_CHECKSUMS));
-        Assert.fail("expected UnsupportedOperationException");
-      } catch (UnsupportedOperationException e) {
-        // expected
+    // The mmap should be of the first block of the file.
+    final ExtendedBlock firstBlock =
+        DFSTestUtil.getFirstBlock(fs, TEST_PATH);
+    cache.accept(new CacheVisitor() {
+      @Override
+      public void visit(int numOutstandingMmaps,
+          Map<ExtendedBlockId, ShortCircuitReplica> replicas,
+          Map<ExtendedBlockId, InvalidToken> failedLoads, 
+          Map<Long, ShortCircuitReplica> evictable,
+          Map<Long, ShortCircuitReplica> evictableMmapped) {
+        ShortCircuitReplica replica = replicas.get(
+            new ExtendedBlockId(firstBlock.getBlockId(), firstBlock.getBlockPoolId()));
+        Assert.assertNotNull(replica);
+        Assert.assertTrue(replica.hasMmap());
+        // The replica should not yet be evictable, since we have it open.
+        Assert.assertNull(replica.getEvictableTimeNs());
       }
+    });
+
+    // Read more blocks.
+    results[2] = fsIn.read(null, 4096,
+        EnumSet.of(ReadOption.SKIP_CHECKSUMS));
+    results[3] = fsIn.read(null, 4096,
+        EnumSet.of(ReadOption.SKIP_CHECKSUMS));
 
-      // we should have 3 mmaps, 0 evictable
-      mmapManager.visitMmaps(countingVisitor);
-      Assert.assertEquals(3, countingVisitor.count);
-      countingVisitor.reset();
-      mmapManager.visitEvictable(countingVisitor);
-      Assert.assertEquals(0, countingVisitor.count);
+    // we should have 3 outstanding mmaps and 2 evictable replicas
+    cache.accept(new CountingVisitor(3, 5, 2, 0));
 
-      // After we close the cursors, the mmaps should be evictable for 
-      // a brief period of time.  Then, they should be closed (we're 
-      // using a very quick timeout)
-      for (ByteBuffer buffer : results) {
-        if (buffer != null) {
-          fsIn.releaseBuffer(buffer);
-        }
+    // After we close the cursors, the mmaps should be evictable for 
+    // a brief period of time.  Then, they should be closed (we're 
+    // using a very quick timeout)
+    for (ByteBuffer buffer : results) {
+      if (buffer != null) {
+        fsIn.releaseBuffer(buffer);
       }
-      GenericTestUtils.waitFor(new Supplier<Boolean>() {
-        public Boolean get() {
-          countingVisitor.reset();
-          try {
-            mmapManager.visitEvictable(countingVisitor);
-          } catch (InterruptedException e) {
-            e.printStackTrace();
-            return false;
-          }
-          return (0 == countingVisitor.count);
-        }
-      }, 10, 10000);
-      countingVisitor.reset();
-      mmapManager.visitMmaps(countingVisitor);
-      Assert.assertEquals(0, countingVisitor.count);
-    } finally {
-      if (fsIn != null) fsIn.close();
-      if (fs != null) fs.close();
-      if (cluster != null) cluster.shutdown();
     }
+    fsIn.close();
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      public Boolean get() {
+        final MutableBoolean finished = new MutableBoolean(false);
+        cache.accept(new CacheVisitor() {
+          @Override
+          public void visit(int numOutstandingMmaps,
+              Map<ExtendedBlockId, ShortCircuitReplica> replicas,
+              Map<ExtendedBlockId, InvalidToken> failedLoads,
+              Map<Long, ShortCircuitReplica> evictable,
+              Map<Long, ShortCircuitReplica> evictableMmapped) {
+            finished.setValue(evictableMmapped.isEmpty());
+          }
+        });
+        return finished.booleanValue();
+      }
+    }, 10, 60000);
+
+    cache.accept(new CountingVisitor(0, -1, -1, -1));
+    
+    fs.close();
+    cluster.shutdown();
   }
 
   /**

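The rewritten zero-copy test above checks cache state through a single visit() callback that receives every internal map at once, instead of making separate counting passes over mmaps and evictable entries. Below is a minimal, self-contained sketch of that visitor-style snapshot pattern; the Cache and Visitor names and the lambda-based demo are illustrative stand-ins, not the Hadoop ShortCircuitCache API.

    import java.util.HashMap;
    import java.util.Map;

    // Minimal sketch of a visitor-style cache snapshot: one lock guards all
    // internal maps, and accept() hands the visitor a consistent view of them.
    class Cache<K, V> {
      interface Visitor<K, V> {
        void visit(Map<K, V> live, Map<K, V> evictable);
      }

      private final Map<K, V> live = new HashMap<>();
      private final Map<K, V> evictable = new HashMap<>();

      synchronized void put(K key, V value) { live.put(key, value); }

      synchronized void markEvictable(K key) {
        V v = live.get(key);
        if (v != null) { evictable.put(key, v); }
      }

      // Copies are handed out so the visitor sees one consistent snapshot.
      synchronized void accept(Visitor<K, V> visitor) {
        visitor.visit(new HashMap<>(live), new HashMap<>(evictable));
      }
    }

    class CacheVisitorDemo {
      public static void main(String[] args) {
        Cache<Long, String> cache = new Cache<>();
        cache.put(1L, "replica-1");
        cache.accept((live, evictable) ->
            // A test would assert on these sizes, as CountingVisitor does above.
            System.out.println(live.size() + " live, " + evictable.size() + " evictable"));
      }
    }

Compared with separate visitMmaps/visitEvictable calls, a single callback avoids the counts drifting between two lock acquisitions.
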
+ 29 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestGlobPaths.java

@@ -21,6 +21,7 @@ import static org.junit.Assert.*;
 
 import java.io.IOException;
 import java.security.PrivilegedExceptionAction;
+import java.util.UUID;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
@@ -1175,4 +1176,32 @@ public class TestGlobPaths {
   public void testReservedHdfsPathsOnFC() throws Exception {
     testOnFileContext(new TestReservedHdfsPaths());
   }
+  
+  /**
+   * Test trying to glob the root.  Regression test for HDFS-5888.
+   **/
+  private static class TestGlobRoot implements FSTestWrapperGlobTest {
+    public void run(FSTestWrapper wrap, FSTestWrapper unprivilegedWrap,
+        FileSystem fs, FileContext fc) throws Exception {
+      final Path rootPath = new Path("/");
+      FileStatus oldRootStatus = wrap.getFileStatus(rootPath);
+      String newOwner = UUID.randomUUID().toString();
+      wrap.setOwner(new Path("/"), newOwner, null);
+      FileStatus[] status = 
+          wrap.globStatus(rootPath, new AcceptAllPathFilter());
+      Assert.assertEquals(1, status.length);
+      Assert.assertEquals(newOwner, status[0].getOwner());
+      wrap.setOwner(new Path("/"), oldRootStatus.getOwner(), null);
+    }
+  }
+
+  @Test
+  public void testGlobRootOnFS() throws Exception {
+    testOnFileSystem(new TestGlobRoot());
+  }
+
+  @Test
+  public void testGlobRootOnFC() throws Exception {
+    testOnFileContext(new TestGlobRoot());
+  }
 }

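The new TestGlobRoot cases pin down the HDFS-5888 contract: globbing the root path must return exactly one FileStatus describing "/" itself. A rough stand-alone check of the same contract, run here against the local file system only as an assumed substitute for the MiniDFSCluster setup used above, could look like this:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GlobRootCheck {
      public static void main(String[] args) throws Exception {
        // The local file system stands in for the MiniDFSCluster used above.
        FileSystem fs = FileSystem.getLocal(new Configuration());
        FileStatus[] status = fs.globStatus(new Path("/"));
        if (status == null || status.length != 1) {
          throw new AssertionError("expected exactly one FileStatus for '/'");
        }
        System.out.println("glob of / resolved to " + status[0].getPath());
      }
    }
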
+ 60 - 13
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/BlockReaderTestUtil.java

@@ -28,8 +28,12 @@ import java.net.Socket;
 import java.util.List;
 import java.util.Random;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
+import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.net.TcpPeerServer;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -38,6 +42,8 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.net.NetUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
 
 /**
  * A helper class to setup the cluster, and get to BlockReader and DataNode for a block.
@@ -141,22 +147,54 @@ public class BlockReaderTestUtil {
    */
   public BlockReader getBlockReader(LocatedBlock testBlock, int offset, int lenToRead)
       throws IOException {
+    return getBlockReader(cluster, testBlock, offset, lenToRead);
+  }
+
+  /**
+   * Get a BlockReader for the given block.
+   */
+  public static BlockReader getBlockReader(MiniDFSCluster cluster,
+      LocatedBlock testBlock, int offset, int lenToRead) throws IOException {
     InetSocketAddress targetAddr = null;
-    Socket sock = null;
     ExtendedBlock block = testBlock.getBlock();
     DatanodeInfo[] nodes = testBlock.getLocations();
     targetAddr = NetUtils.createSocketAddr(nodes[0].getXferAddr());
-    sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
-    sock.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
-    sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
-
-    return BlockReaderFactory.newBlockReader(
-      new DFSClient.Conf(conf),
-      targetAddr.toString()+ ":" + block.getBlockId(), block,
-      testBlock.getBlockToken(), 
-      offset, lenToRead,
-      true, "BlockReaderTestUtil", TcpPeerServer.peerFromSocket(sock),
-      nodes[0], null, null, null, false, CachingStrategy.newDefaultStrategy());
+
+    final DistributedFileSystem fs = cluster.getFileSystem();
+    return new BlockReaderFactory(fs.getClient().getConf()).
+      setInetSocketAddress(targetAddr).
+      setBlock(block).
+      setFileName(targetAddr.toString()+ ":" + block.getBlockId()).
+      setBlockToken(testBlock.getBlockToken()).
+      setStartOffset(offset).
+      setLength(lenToRead).
+      setVerifyChecksum(true).
+      setClientName("BlockReaderTestUtil").
+      setDatanodeInfo(nodes[0]).
+      setClientCacheContext(ClientContext.getFromConf(fs.getConf())).
+      setCachingStrategy(CachingStrategy.newDefaultStrategy()).
+      setConfiguration(fs.getConf()).
+      setAllowShortCircuitLocalReads(true).
+      setRemotePeerFactory(new RemotePeerFactory() {
+        @Override
+        public Peer newConnectedPeer(InetSocketAddress addr)
+            throws IOException {
+          Peer peer = null;
+          Socket sock = NetUtils.
+              getDefaultSocketFactory(fs.getConf()).createSocket();
+          try {
+            sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
+            sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
+            peer = TcpPeerServer.peerFromSocket(sock);
+          } finally {
+            if (peer == null) {
+              IOUtils.closeQuietly(sock);
+            }
+          }
+          return peer;
+        }
+      }).
+      build();
   }
 
   /**
@@ -167,4 +205,13 @@ public class BlockReaderTestUtil {
     int ipcport = nodes[0].getIpcPort();
     return cluster.getDataNode(ipcport);
   }
-}
+  
+  public static void enableBlockReaderFactoryTracing() {
+    LogManager.getLogger(BlockReaderFactory.class.getName()).setLevel(
+        Level.TRACE);
+    LogManager.getLogger(ShortCircuitCache.class.getName()).setLevel(
+        Level.TRACE);
+    LogManager.getLogger(ShortCircuitReplica.class.getName()).setLevel(
+        Level.TRACE);
+  }
+}

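The utility above replaces one long positional call to BlockReaderFactory.newBlockReader(...) with a fluent builder plus a RemotePeerFactory callback for socket creation. The sketch below shows only the general builder shape, under the assumption that validation happens once in build(); ReaderBuilder and Reader are hypothetical names, not the actual Hadoop classes.

    // Hypothetical fluent builder, mirroring the setX(...).build() shape above.
    class ReaderBuilder {
      private String fileName;
      private long startOffset;
      private long length;
      private boolean verifyChecksum = true;

      ReaderBuilder setFileName(String fileName) { this.fileName = fileName; return this; }
      ReaderBuilder setStartOffset(long offset) { this.startOffset = offset; return this; }
      ReaderBuilder setLength(long length) { this.length = length; return this; }
      ReaderBuilder setVerifyChecksum(boolean verify) { this.verifyChecksum = verify; return this; }

      Reader build() {
        // One place to validate, instead of a constructor with many positional args.
        if (fileName == null) {
          throw new IllegalStateException("fileName is required");
        }
        return new Reader(fileName, startOffset, length, verifyChecksum);
      }
    }

    class Reader {
      private final String fileName;
      Reader(String fileName, long startOffset, long length, boolean verifyChecksum) {
        this.fileName = fileName;
      }
      @Override public String toString() { return "Reader(" + fileName + ")"; }
    }

    class BuilderDemo {
      public static void main(String[] args) {
        Reader r = new ReaderBuilder()
            .setFileName("/test_file")
            .setStartOffset(0)
            .setLength(8193)
            .setVerifyChecksum(true)
            .build();
        System.out.println("built " + r);
      }
    }

Keeping socket creation behind a callback, as the RemotePeerFactory above does, lets the test own connection setup and cleanup without the builder needing to know about sockets.
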
+ 32 - 5
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java

@@ -186,10 +186,26 @@ public class DFSTestUtil {
     }
   }
 
-  public static String readFile(FileSystem fs, Path fileName) throws IOException {
+  public static String readFile(FileSystem fs, Path fileName)
+      throws IOException {
+    byte buf[] = readFileBuffer(fs, fileName);
+    return new String(buf, 0, buf.length);
+  }
+
+  public static byte[] readFileBuffer(FileSystem fs, Path fileName) 
+      throws IOException {
     ByteArrayOutputStream os = new ByteArrayOutputStream();
-    IOUtils.copyBytes(fs.open(fileName), os, 1024, true);
-    return os.toString();
+    try {
+      FSDataInputStream in = fs.open(fileName);
+      try {
+        IOUtils.copyBytes(in, os, 1024, true);
+        return os.toByteArray();
+      } finally {
+        in.close();
+      }
+    } finally {
+      os.close();
+    }
   }
   
   public static void createFile(FileSystem fs, Path fileName, long fileLen, 
@@ -231,6 +247,13 @@ public class DFSTestUtil {
     }
   }
   
+  public static byte[] calculateFileContentsFromSeed(long seed, int length) {
+    Random rb = new Random(seed);
+    byte val[] = new byte[length];
+    rb.nextBytes(val);
+    return val;
+  }
+  
   /** check if the files have been copied correctly. */
   public boolean checkFiles(FileSystem fs, String topdir) throws IOException {
     Path root = new Path(topdir);
@@ -550,8 +573,12 @@ public class DFSTestUtil {
   
   public static ExtendedBlock getFirstBlock(FileSystem fs, Path path) throws IOException {
     HdfsDataInputStream in = (HdfsDataInputStream) fs.open(path);
-    in.readByte();
-    return in.getCurrentBlock();
+    try {
+      in.readByte();
+      return in.getCurrentBlock();
+    } finally {
+      in.close();
+    }
   }  
 
   public static List<LocatedBlock> getAllBlocks(FSDataInputStream in)

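calculateFileContentsFromSeed lets a test regenerate a file's expected bytes from the same seed that was used to create it, so no golden data needs to be stored. A minimal stand-alone illustration of that idea (the class and method names here are made up):

    import java.util.Arrays;
    import java.util.Random;

    public class SeededContentCheck {
      // A fixed seed yields a reproducible byte sequence, so the expected
      // contents can be recomputed instead of stored alongside the test.
      static byte[] contentsFromSeed(long seed, int length) {
        byte[] expected = new byte[length];
        new Random(seed).nextBytes(expected);
        return expected;
      }

      public static void main(String[] args) {
        byte[] a = contentsFromSeed(0xFADED, 8193);
        byte[] b = contentsFromSeed(0xFADED, 8193);
        System.out.println("deterministic: " + Arrays.equals(a, b)); // prints true
      }
    }
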
+ 285 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockReaderFactory.java

@@ -0,0 +1,285 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
+import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.net.unix.DomainSocket;
+import org.apache.hadoop.net.unix.TemporarySocketDirectory;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.util.concurrent.Uninterruptibles;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC;
+
+public class TestBlockReaderFactory {
+  static final Log LOG = LogFactory.getLog(TestBlockReaderFactory.class);
+
+  @Before
+  public void init() {
+    DomainSocket.disableBindPathValidation();
+  }
+
+  @After
+  public void cleanup() {
+    DFSInputStream.tcpReadsDisabledForTesting = false;
+    BlockReaderFactory.createShortCircuitReplicaInfoCallback = null;
+  }
+
+  private static Configuration createShortCircuitConf(String testName,
+      TemporarySocketDirectory sockDir) {
+    Configuration conf = new Configuration();
+    conf.set(DFS_CLIENT_CONTEXT, testName);
+    conf.setLong(DFS_BLOCK_SIZE_KEY, 4096);
+    conf.set(DFS_DOMAIN_SOCKET_PATH_KEY, new File(sockDir.getDir(),
+        testName + "._PORT").getAbsolutePath());
+    conf.setBoolean(DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
+    conf.setBoolean(DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
+        false);
+    conf.setBoolean(DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, false);
+    return conf;
+  }
+
+  /**
+   * If we have a UNIX domain socket configured,
+   * and we have dfs.client.domain.socket.data.traffic set to true,
+   * and short-circuit access fails, we should still be able to pass
+   * data traffic over the UNIX domain socket.  Test this.
+   */
+  @Test(timeout=60000)
+  public void testFallbackFromShortCircuitToUnixDomainTraffic()
+      throws Exception {
+    DFSInputStream.tcpReadsDisabledForTesting = true;
+    TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
+
+    // The server is NOT configured with short-circuit local reads;
+    // the client is.  Both support UNIX domain reads.
+    Configuration clientConf = createShortCircuitConf(
+        "testFallbackFromShortCircuitToUnixDomainTraffic", sockDir);
+    clientConf.setBoolean(DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, true);
+    Configuration serverConf = new Configuration(clientConf);
+    serverConf.setBoolean(DFS_CLIENT_READ_SHORTCIRCUIT_KEY, false);
+
+    MiniDFSCluster cluster =
+        new MiniDFSCluster.Builder(serverConf).numDataNodes(1).build();
+    cluster.waitActive();
+    FileSystem dfs = FileSystem.get(cluster.getURI(0), clientConf);
+    String TEST_FILE = "/test_file";
+    final int TEST_FILE_LEN = 8193;
+    final int SEED = 0xFADED;
+    DFSTestUtil.createFile(dfs, new Path(TEST_FILE), TEST_FILE_LEN,
+        (short)1, SEED);
+    byte contents[] = DFSTestUtil.readFileBuffer(dfs, new Path(TEST_FILE));
+    byte expected[] = DFSTestUtil.
+        calculateFileContentsFromSeed(SEED, TEST_FILE_LEN);
+    Assert.assertTrue(Arrays.equals(contents, expected));
+    cluster.shutdown();
+    sockDir.close();
+  }
+  
+  /**
+   * Test the case where we have multiple threads waiting on the
+   * ShortCircuitCache delivering a certain ShortCircuitReplica.
+   *
+   * In this case, there should only be one call to
+   * createShortCircuitReplicaInfo.  This one replica should be shared
+   * by all threads.
+   */
+  @Test(timeout=60000)
+  public void testMultipleWaitersOnShortCircuitCache()
+      throws Exception {
+    final CountDownLatch latch = new CountDownLatch(1);
+    final AtomicBoolean creationIsBlocked = new AtomicBoolean(true);
+    final AtomicBoolean testFailed = new AtomicBoolean(false);
+    DFSInputStream.tcpReadsDisabledForTesting = true;
+    BlockReaderFactory.createShortCircuitReplicaInfoCallback =
+      new ShortCircuitCache.ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          Uninterruptibles.awaitUninterruptibly(latch);
+          if (!creationIsBlocked.compareAndSet(true, false)) {
+            Assert.fail("there were multiple calls to "
+                + "createShortCircuitReplicaInfo.  Only one was expected.");
+          }
+          return null;
+        }
+      };
+    TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
+    Configuration conf = createShortCircuitConf(
+        "testMultipleWaitersOnShortCircuitCache", sockDir);
+    MiniDFSCluster cluster =
+        new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
+    cluster.waitActive();
+    final DistributedFileSystem dfs = cluster.getFileSystem();
+    final String TEST_FILE = "/test_file";
+    final int TEST_FILE_LEN = 4000;
+    final int SEED = 0xFADED;
+    final int NUM_THREADS = 10;
+    DFSTestUtil.createFile(dfs, new Path(TEST_FILE), TEST_FILE_LEN,
+        (short)1, SEED);
+    Runnable readerRunnable = new Runnable() {
+      @Override
+      public void run() {
+        try {
+          byte contents[] = DFSTestUtil.readFileBuffer(dfs, new Path(TEST_FILE));
+          Assert.assertFalse(creationIsBlocked.get());
+          byte expected[] = DFSTestUtil.
+              calculateFileContentsFromSeed(SEED, TEST_FILE_LEN);
+          Assert.assertTrue(Arrays.equals(contents, expected));
+        } catch (Throwable e) {
+          LOG.error("readerRunnable error", e);
+          testFailed.set(true);
+        }
+      }
+    };
+    Thread threads[] = new Thread[NUM_THREADS];
+    for (int i = 0; i < NUM_THREADS; i++) {
+      threads[i] = new Thread(readerRunnable);
+      threads[i].start();
+    }
+    Thread.sleep(500);
+    latch.countDown();
+    for (int i = 0; i < NUM_THREADS; i++) {
+      Uninterruptibles.joinUninterruptibly(threads[i]);
+    }
+    cluster.shutdown();
+    sockDir.close();
+    Assert.assertFalse(testFailed.get());
+  }
+
+  /**
+   * Test the case where an attempt to complete a short-circuit read fails
+   * at first, and a later attempt succeeds.
+   * Any thread waiting on a cache load should receive the failure (if it
+   * occurs);  however, the failure result should not be cached.  We want 
+   * to be able to retry later and succeed.
+   */
+  @Test(timeout=60000)
+  public void testShortCircuitCacheTemporaryFailure()
+      throws Exception {
+    BlockReaderTestUtil.enableBlockReaderFactoryTracing();
+    final AtomicBoolean replicaCreationShouldFail = new AtomicBoolean(true);
+    final AtomicBoolean testFailed = new AtomicBoolean(false);
+    DFSInputStream.tcpReadsDisabledForTesting = true;
+    BlockReaderFactory.createShortCircuitReplicaInfoCallback =
+      new ShortCircuitCache.ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          if (replicaCreationShouldFail.get()) {
+            // Insert a short delay to increase the chance that one client
+            // thread waits for the other client thread's failure via
+            // a condition variable.
+            Uninterruptibles.sleepUninterruptibly(2, TimeUnit.SECONDS);
+            return new ShortCircuitReplicaInfo();
+          }
+          return null;
+        }
+      };
+    TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
+    Configuration conf = createShortCircuitConf(
+        "testShortCircuitCacheTemporaryFailure", sockDir);
+    final MiniDFSCluster cluster =
+        new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
+    cluster.waitActive();
+    final DistributedFileSystem dfs = cluster.getFileSystem();
+    final String TEST_FILE = "/test_file";
+    final int TEST_FILE_LEN = 4000;
+    final int NUM_THREADS = 2;
+    final int SEED = 0xFADED;
+    final CountDownLatch gotFailureLatch = new CountDownLatch(NUM_THREADS);
+    final CountDownLatch shouldRetryLatch = new CountDownLatch(1);
+    DFSTestUtil.createFile(dfs, new Path(TEST_FILE), TEST_FILE_LEN,
+        (short)1, SEED);
+    Runnable readerRunnable = new Runnable() {
+      @Override
+      public void run() {
+        try {
+          // First time should fail.
+          List<LocatedBlock> locatedBlocks = 
+              cluster.getNameNode().getRpcServer().getBlockLocations(
+              TEST_FILE, 0, TEST_FILE_LEN).getLocatedBlocks();
+          LocatedBlock lblock = locatedBlocks.get(0); // first block
+          BlockReader blockReader = null;
+          try {
+            blockReader = BlockReaderTestUtil.
+                getBlockReader(cluster, lblock, 0, TEST_FILE_LEN);
+            Assert.fail("expected getBlockReader to fail the first time.");
+          } catch (Throwable t) { 
+            Assert.assertTrue("expected to see 'TCP reads were disabled " +
+                "for testing' in exception " + t, t.getMessage().contains(
+                "TCP reads were disabled for testing"));
+          } finally {
+            if (blockReader != null) blockReader.close(); // keep findbugs happy
+          }
+          gotFailureLatch.countDown();
+          shouldRetryLatch.await();
+
+          // Second time should succeed.
+          try {
+            blockReader = BlockReaderTestUtil.
+                getBlockReader(cluster, lblock, 0, TEST_FILE_LEN);
+          } catch (Throwable t) { 
+            LOG.error("error trying to retrieve a block reader " +
+                "the second time.", t);
+            throw t;
+          } finally {
+            if (blockReader != null) blockReader.close();
+          }
+        } catch (Throwable t) {
+          LOG.error("getBlockReader failure", t);
+          testFailed.set(true);
+        }
+      }
+    };
+    Thread threads[] = new Thread[NUM_THREADS];
+    for (int i = 0; i < NUM_THREADS; i++) {
+      threads[i] = new Thread(readerRunnable);
+      threads[i].start();
+    }
+    gotFailureLatch.await();
+    replicaCreationShouldFail.set(false);
+    shouldRetryLatch.countDown();
+    for (int i = 0; i < NUM_THREADS; i++) {
+      Uninterruptibles.joinUninterruptibly(threads[i]);
+    }
+    cluster.shutdown();
+    sockDir.close();
+    Assert.assertFalse(testFailed.get());
+  }
+}

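testMultipleWaitersOnShortCircuitCache asserts that when many threads ask for the same block, the expensive creation step runs exactly once and the result is shared, while testShortCircuitCacheTemporaryFailure adds that a failed creation must not be cached, so a later attempt can succeed. The sketch below shows only the single-creation half of that behavior, using ConcurrentHashMap.computeIfAbsent as an assumed simplification of whatever synchronization the real cache uses.

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.atomic.AtomicInteger;

    public class SingleCreatorDemo {
      // computeIfAbsent runs the creator at most once per key; concurrent
      // callers for the same key block until that one creation finishes.
      static final ConcurrentHashMap<String, String> CACHE = new ConcurrentHashMap<>();
      static final AtomicInteger CREATIONS = new AtomicInteger();

      static String fetchOrCreate(String key) {
        return CACHE.computeIfAbsent(key, k -> {
          CREATIONS.incrementAndGet();   // should end up at exactly 1
          sleepQuietly(200);             // simulate a slow replica load
          return "replica-for-" + k;
        });
      }

      public static void main(String[] args) throws InterruptedException {
        final CountDownLatch done = new CountDownLatch(10);
        for (int i = 0; i < 10; i++) {
          new Thread(() -> { fetchOrCreate("block-123"); done.countDown(); }).start();
        }
        done.await();
        System.out.println("creator calls: " + CREATIONS.get()); // expected: 1
      }

      static void sleepQuietly(long ms) {
        try { Thread.sleep(ms); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
      }
    }

Note that computeIfAbsent records no mapping when the creator returns null or throws, which is one simple way to get the "failures are not cached" property the second test checks; the real cache's failure handling is more involved.
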
+ 9 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestBlockReaderLocal.java

@@ -30,13 +30,15 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.net.unix.DomainSocket;
 import org.apache.hadoop.net.unix.TemporarySocketDirectory;
+import org.apache.hadoop.util.Time;
 import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.Assume;
@@ -155,6 +157,8 @@ public class TestBlockReaderLocal {
       File metaFile = MiniDFSCluster.getBlockMetadataFile(0, block);
 
       DatanodeID datanodeID = cluster.getDataNodes().get(0).getDatanodeId();
+      ShortCircuitCache shortCircuitCache =
+          ClientContext.getFromConf(conf).getShortCircuitCache();
       cluster.shutdown();
       cluster = null;
       test.setup(dataFile, checksum);
@@ -164,16 +168,17 @@ public class TestBlockReaderLocal {
       };
       dataIn = streams[0];
       metaIn = streams[1];
+      ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
+      ShortCircuitReplica replica = new ShortCircuitReplica(
+          key, dataIn, metaIn, shortCircuitCache, Time.now());
       blockReaderLocal = new BlockReaderLocal.Builder(
               new DFSClient.Conf(conf)).
           setFilename(TEST_PATH.getName()).
           setBlock(block).
-          setStreams(streams).
+          setShortCircuitReplica(replica).
           setDatanodeID(datanodeID).
           setCachingStrategy(new CachingStrategy(false, readahead)).
           setVerifyChecksum(checksum).
-          setBlockMetadataHeader(BlockMetadataHeader.preadHeader(
-              metaIn.getChannel())).
           build();
       dataIn = null;
       metaIn = null;

+ 17 - 52
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestConnCache.java

@@ -25,18 +25,8 @@ import java.net.InetSocketAddress;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
-import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
-import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
-import org.apache.hadoop.hdfs.net.Peer;
-import org.apache.hadoop.security.token.Token;
 import org.junit.Assert;
 import org.junit.Test;
-import org.mockito.Matchers;
-import org.mockito.Mockito;
-import org.mockito.invocation.InvocationOnMock;
-import org.mockito.stubbing.Answer;
 
 /**
  * This class tests the client connection caching in a single node
@@ -48,30 +38,6 @@ public class TestConnCache {
   static final int BLOCK_SIZE = 4096;
   static final int FILE_SIZE = 3 * BLOCK_SIZE;
 
-  /**
-   * A mock Answer to remember the BlockReader used.
-   *
-   * It verifies that all invocation to DFSInputStream.getBlockReader()
-   * use the same peer.
-   */
-  private class MockGetBlockReader implements Answer<RemoteBlockReader2> {
-    public RemoteBlockReader2 reader = null;
-    private Peer peer = null;
-
-    @Override
-    public RemoteBlockReader2 answer(InvocationOnMock invocation) throws Throwable {
-      RemoteBlockReader2 prevReader = reader;
-      reader = (RemoteBlockReader2) invocation.callRealMethod();
-      if (peer == null) {
-        peer = reader.getPeer();
-      } else if (prevReader != null) {
-        Assert.assertSame("DFSInputStream should use the same peer",
-                   peer, reader.getPeer());
-      }
-      return reader;
-    }
-  }
-
   /**
    * (Optionally) seek to position, read and verify data.
    *
@@ -115,33 +81,29 @@ public class TestConnCache {
    * @throws Exception 
    */
   @Test
-  @SuppressWarnings("unchecked")
   public void testReadFromOneDN() throws Exception {
-    BlockReaderTestUtil util = new BlockReaderTestUtil(1,
-        new HdfsConfiguration());
+    HdfsConfiguration configuration = new HdfsConfiguration();
+    // One of the goals of this test is to verify that we don't open more
+    // than one socket.  So use a different client context, so that we
+    // get our own socket cache, rather than sharing with the other test 
+    // instances.  Also use a really long socket timeout so that nothing
+    // gets closed before we get around to checking the cache size at the end.
+    final String contextName = "testReadFromOneDNContext";
+    configuration.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, contextName);
+    configuration.setLong(DFSConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY,
+        100000000L);
+    BlockReaderTestUtil util = new BlockReaderTestUtil(1, configuration);
     final Path testFile = new Path("/testConnCache.dat");
     byte authenticData[] = util.writeFile(testFile, FILE_SIZE / 1024);
     DFSClient client = new DFSClient(
         new InetSocketAddress("localhost",
             util.getCluster().getNameNodePort()), util.getConf());
-    DFSInputStream in = Mockito.spy(client.open(testFile.toString()));
+    ClientContext cacheContext =
+        ClientContext.get(contextName, client.getConf());
+    DFSInputStream in = client.open(testFile.toString());
     LOG.info("opened " + testFile.toString());
     byte[] dataBuf = new byte[BLOCK_SIZE];
 
-    MockGetBlockReader answer = new MockGetBlockReader();
-    Mockito.doAnswer(answer).when(in).getBlockReader(
-                           (InetSocketAddress) Matchers.anyObject(),
-                           (DatanodeInfo) Matchers.anyObject(),
-                           Matchers.anyString(),
-                           (ExtendedBlock) Matchers.anyObject(),
-                           (Token<BlockTokenIdentifier>) Matchers.anyObject(),
-                           Matchers.anyLong(),
-                           Matchers.anyLong(),
-                           Matchers.anyInt(),
-                           Matchers.anyBoolean(),
-                           Matchers.anyString(),
-                           (CachingStrategy)Matchers.anyObject());
-
     // Initial read
     pread(in, 0, dataBuf, 0, dataBuf.length, authenticData);
     // Read again and verify that the socket is the same
@@ -153,5 +115,8 @@ public class TestConnCache {
     pread(in, 64, dataBuf, 0, dataBuf.length / 2, authenticData);
 
     in.close();
+    client.close();
+    Assert.assertEquals(1,
+        ClientContext.getFromConf(configuration).getPeerCache().size());
   }
 }

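Several of these tests now set dfs.client.context so that each test gets (or deliberately shares) one client-side cache context, instead of reaching into a per-DFSClient peerCache field. A minimal sketch of a name-keyed context registry in that spirit, with purely illustrative class names:

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Sketch of a name-keyed client context registry, in the spirit of the
    // ClientContext.get(...) calls above; these classes are stand-ins only.
    class ClientContextRegistry {
      static final class Context {
        final String name;
        final Map<String, Object> peerCache = new ConcurrentHashMap<>();
        Context(String name) { this.name = name; }
      }

      private static final ConcurrentHashMap<String, Context> CONTEXTS =
          new ConcurrentHashMap<>();

      // Clients configured with the same context name share one Context
      // (and therefore one peer cache); different names stay isolated.
      static Context get(String name) {
        return CONTEXTS.computeIfAbsent(name, Context::new);
      }
    }

    class RegistryDemo {
      public static void main(String[] args) {
        ClientContextRegistry.Context a = ClientContextRegistry.get("testReadFromOneDNContext");
        ClientContextRegistry.Context b = ClientContextRegistry.get("testReadFromOneDNContext");
        System.out.println("same context shared: " + (a == b)); // true
      }
    }

Clients created with the same context name observe one shared peer cache, which is what the size assertion at the end of testReadFromOneDN relies on.
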
+ 22 - 20
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferKeepalive.java

@@ -22,7 +22,7 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_REUSE_KEE
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_WRITE_TIMEOUT_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
@@ -86,21 +86,22 @@ public class TestDataTransferKeepalive {
     // the datanode-side expiration time.
     final long CLIENT_EXPIRY_MS = 60000L;
     clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
-    PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT, CLIENT_EXPIRY_MS);
+    clientConf.set(DFS_CLIENT_CONTEXT, "testDatanodeRespectsKeepAliveTimeout");
     DistributedFileSystem fs =
         (DistributedFileSystem)FileSystem.get(cluster.getURI(),
             clientConf);
+    PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
 
     DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
 
     // Clients that write aren't currently re-used.
-    assertEquals(0, fs.dfs.peerCache.size());
+    assertEquals(0, peerCache.size());
     assertXceiverCount(0);
 
     // Reads the file, so we should get a
     // cached socket, and should have an xceiver on the other side.
     DFSTestUtil.readFile(fs, TEST_FILE);
-    assertEquals(1, fs.dfs.peerCache.size());
+    assertEquals(1, peerCache.size());
     assertXceiverCount(1);
 
     // Sleep for a bit longer than the keepalive timeout
@@ -111,15 +112,13 @@ public class TestDataTransferKeepalive {
     // The socket is still in the cache, because we don't
     // notice that it's closed until we try to read
     // from it again.
-    assertEquals(1, fs.dfs.peerCache.size());
+    assertEquals(1, peerCache.size());
     
     // Take it out of the cache - reading should
     // give an EOF.
-    Peer peer = fs.dfs.peerCache.get(dn.getDatanodeId(), false);
+    Peer peer = peerCache.get(dn.getDatanodeId(), false);
     assertNotNull(peer);
     assertEquals(-1, peer.getInputStream().read());
-    PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT,
-        DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT);
   }
 
   /**
@@ -132,34 +131,33 @@ public class TestDataTransferKeepalive {
     // the datanode-side expiration time.
     final long CLIENT_EXPIRY_MS = 10L;
     clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
-    PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT, CLIENT_EXPIRY_MS);
+    clientConf.set(DFS_CLIENT_CONTEXT, "testClientResponsesKeepAliveTimeout");
     DistributedFileSystem fs =
         (DistributedFileSystem)FileSystem.get(cluster.getURI(),
             clientConf);
+    PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
 
     DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
 
     // Clients that write aren't currently re-used.
-    assertEquals(0, fs.dfs.peerCache.size());
+    assertEquals(0, peerCache.size());
     assertXceiverCount(0);
 
     // Reads the file, so we should get a
     // cached socket, and should have an xceiver on the other side.
     DFSTestUtil.readFile(fs, TEST_FILE);
-    assertEquals(1, fs.dfs.peerCache.size());
+    assertEquals(1, peerCache.size());
     assertXceiverCount(1);
 
     // Sleep for a bit longer than the client keepalive timeout.
     Thread.sleep(CLIENT_EXPIRY_MS + 1);
     
     // Taking out a peer which is expired should give a null.
-    Peer peer = fs.dfs.peerCache.get(dn.getDatanodeId(), false);
+    Peer peer = peerCache.get(dn.getDatanodeId(), false);
     assertTrue(peer == null);
 
     // The socket cache is now empty.
-    assertEquals(0, fs.dfs.peerCache.size());
-    PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT,
-        DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT);
+    assertEquals(0, peerCache.size());
   }
 
   /**
@@ -174,7 +172,7 @@ public class TestDataTransferKeepalive {
     final long CLIENT_EXPIRY_MS = 600000L;
     Configuration clientConf = new Configuration(conf);
     clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
-    PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT, CLIENT_EXPIRY_MS);
+    clientConf.set(DFS_CLIENT_CONTEXT, "testSlowReader");
     DistributedFileSystem fs =
         (DistributedFileSystem)FileSystem.get(cluster.getURI(),
             clientConf);
@@ -209,7 +207,12 @@ public class TestDataTransferKeepalive {
   @Test(timeout=30000)
   public void testManyClosedSocketsInCache() throws Exception {
     // Make a small file
-    DistributedFileSystem fs = cluster.getFileSystem();
+    Configuration clientConf = new Configuration(conf);
+    clientConf.set(DFS_CLIENT_CONTEXT, "testManyClosedSocketsInCache");
+    DistributedFileSystem fs =
+        (DistributedFileSystem)FileSystem.get(cluster.getURI(),
+            clientConf);
+    PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
     DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
 
     // Insert a bunch of dead sockets in the cache, by opening
@@ -227,15 +230,14 @@ public class TestDataTransferKeepalive {
       IOUtils.cleanup(null, stms);
     }
     
-    DFSClient client = ((DistributedFileSystem)fs).dfs;
-    assertEquals(5, client.peerCache.size());
+    assertEquals(5, peerCache.size());
     
     // Let all the xceivers timeout
     Thread.sleep(1500);
     assertXceiverCount(0);
 
     // Client side still has the sockets cached
-    assertEquals(5, client.peerCache.size());
+    assertEquals(5, peerCache.size());
 
     // Reading should not throw an exception.
     DFSTestUtil.readFile(fs, TEST_FILE);

+ 2 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDisableConnCache.java

@@ -53,7 +53,8 @@ public class TestDisableConnCache {
     FileSystem fsWithoutCache = FileSystem.newInstance(util.getConf());
     try {
       DFSTestUtil.readFile(fsWithoutCache, testFile);
-      assertEquals(0, ((DistributedFileSystem)fsWithoutCache).dfs.peerCache.size());
+      assertEquals(0, ((DistributedFileSystem)fsWithoutCache).
+          dfs.getClientContext().getPeerCache().size());
     } finally {
       fsWithoutCache.close();
       util.shutdown();

+ 0 - 126
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileInputStreamCache.java

@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hdfs;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hdfs.protocol.DatanodeID;
-import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.net.unix.TemporarySocketDirectory;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
-public class TestFileInputStreamCache {
-  static final Log LOG = LogFactory.getLog(TestFileInputStreamCache.class);
-
-  @Test
-  public void testCreateAndDestroy() throws Exception {
-    FileInputStreamCache cache = new FileInputStreamCache(10, 1000);
-    cache.close();
-  }
-  
-  private static class TestFileDescriptorPair {
-    TemporarySocketDirectory dir = new TemporarySocketDirectory();
-    FileInputStream fis[];
-
-    public TestFileDescriptorPair() throws IOException {
-      fis = new FileInputStream[2];
-      for (int i = 0; i < 2; i++) {
-        String name = dir.getDir() + "/file" + i;
-        FileOutputStream fos = new FileOutputStream(name);
-        fos.write(1);
-        fos.close();
-        fis[i] = new FileInputStream(name);
-      }
-    }
-
-    public FileInputStream[] getFileInputStreams() {
-      return fis;
-    }
-
-    public void close() throws IOException {
-      IOUtils.cleanup(LOG, fis);
-      dir.close();
-    }
-
-    public boolean compareWith(FileInputStream other[]) {
-      if ((other == null) || (fis == null)) {
-        return other == fis;
-      }
-      if (fis.length != other.length) return false;
-      for (int i = 0; i < fis.length; i++) {
-        if (fis[i] != other[i]) return false;
-      }
-      return true;
-    }
-  }
-
-  @Test
-  public void testAddAndRetrieve() throws Exception {
-    FileInputStreamCache cache = new FileInputStreamCache(1, 1000000);
-    DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost", 
-        "xyzzy", 8080, 9090, 7070, 6060);
-    ExtendedBlock block = new ExtendedBlock("poolid", 123);
-    TestFileDescriptorPair pair = new TestFileDescriptorPair();
-    cache.put(dnId, block, pair.getFileInputStreams());
-    FileInputStream fis[] = cache.get(dnId, block);
-    Assert.assertTrue(pair.compareWith(fis));
-    pair.close();
-    cache.close();
-  }
-
-  @Test
-  public void testExpiry() throws Exception {
-    FileInputStreamCache cache = new FileInputStreamCache(1, 10);
-    DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost", 
-        "xyzzy", 8080, 9090, 7070, 6060);
-    ExtendedBlock block = new ExtendedBlock("poolid", 123);
-    TestFileDescriptorPair pair = new TestFileDescriptorPair();
-    cache.put(dnId, block, pair.getFileInputStreams());
-    Thread.sleep(cache.getExpiryTimeMs() * 100);
-    FileInputStream fis[] = cache.get(dnId, block);
-    Assert.assertNull(fis);
-    pair.close();
-    cache.close();
-  }
-
-  @Test
-  public void testEviction() throws Exception {
-    FileInputStreamCache cache = new FileInputStreamCache(1, 10000000);
-    DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost", 
-        "xyzzy", 8080, 9090, 7070, 6060);
-    ExtendedBlock block = new ExtendedBlock("poolid", 123);
-    TestFileDescriptorPair pair = new TestFileDescriptorPair();
-    cache.put(dnId, block, pair.getFileInputStreams());
-    DatanodeID dnId2 = new DatanodeID("127.0.0.1", "localhost", 
-        "xyzzy", 8081, 9091, 7071, 6061);
-    TestFileDescriptorPair pair2 = new TestFileDescriptorPair();
-    cache.put(dnId2, block, pair2.getFileInputStreams());
-    FileInputStream fis[] = cache.get(dnId, block);
-    Assert.assertNull(fis);
-    FileInputStream fis2[] = cache.get(dnId2, block);
-    Assert.assertTrue(pair2.compareWith(fis2));
-    pair.close();
-    cache.close();
-  }
-}

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileStatus.java

@@ -303,5 +303,6 @@ public class TestFileStatus {
         FileSystem.LOG.info("GOOD: getting an exception", ioe);
       }
     }
+    fs.delete(dir, true);
   }
 }

+ 346 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestShortCircuitCache.java

@@ -0,0 +1,346 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import org.apache.commons.lang.mutable.MutableBoolean;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache.ShortCircuitReplicaCreator;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
+import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.net.unix.TemporarySocketDirectory;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.util.DataChecksum;
+import org.apache.hadoop.util.Time;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.base.Preconditions;
+import com.google.common.base.Supplier;
+
+import java.io.DataOutputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+public class TestShortCircuitCache {
+  static final Log LOG = LogFactory.getLog(TestShortCircuitCache.class);
+  
+  private static class TestFileDescriptorPair {
+    TemporarySocketDirectory dir = new TemporarySocketDirectory();
+    FileInputStream fis[];
+
+    public TestFileDescriptorPair() throws IOException {
+      fis = new FileInputStream[2];
+      for (int i = 0; i < 2; i++) {
+        String name = dir.getDir() + "/file" + i;
+        FileOutputStream fos = new FileOutputStream(name);
+        if (i == 0) {
+          // write 'data' file
+          fos.write(1);
+        } else {
+          // write 'metadata' file
+          BlockMetadataHeader header =
+              new BlockMetadataHeader((short)1,
+                  DataChecksum.newDataChecksum(DataChecksum.Type.NULL, 4));
+          DataOutputStream dos = new DataOutputStream(fos);
+          BlockMetadataHeader.writeHeader(dos, header);
+          dos.close();
+        }
+        fos.close();
+        fis[i] = new FileInputStream(name);
+      }
+    }
+
+    public FileInputStream[] getFileInputStreams() {
+      return fis;
+    }
+
+    public void close() throws IOException {
+      IOUtils.cleanup(LOG, fis);
+      dir.close();
+    }
+
+    public boolean compareWith(FileInputStream data, FileInputStream meta) {
+      return ((data == fis[0]) && (meta == fis[1]));
+    }
+  }
+
+  private static class SimpleReplicaCreator
+      implements ShortCircuitReplicaCreator {
+    private final int blockId;
+    private final ShortCircuitCache cache;
+    private final TestFileDescriptorPair pair;
+
+    SimpleReplicaCreator(int blockId, ShortCircuitCache cache,
+        TestFileDescriptorPair pair) {
+      this.blockId = blockId;
+      this.cache = cache;
+      this.pair = pair;
+    }
+
+    @Override
+    public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+      try {
+        ExtendedBlockId key = new ExtendedBlockId(blockId, "test_bp1");
+        return new ShortCircuitReplicaInfo(
+            new ShortCircuitReplica(key,
+                pair.getFileInputStreams()[0], pair.getFileInputStreams()[1],
+                cache, Time.monotonicNow()));
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  @Test(timeout=60000)
+  public void testCreateAndDestroy() throws Exception {
+    ShortCircuitCache cache =
+        new ShortCircuitCache(10, 1, 10, 1, 1, 10000);
+    cache.close();
+  }
+  
+  @Test(timeout=60000)
+  public void testAddAndRetrieve() throws Exception {
+    final ShortCircuitCache cache =
+        new ShortCircuitCache(10, 10000000, 10, 10000000, 1, 10000);
+    final TestFileDescriptorPair pair = new TestFileDescriptorPair();
+    ShortCircuitReplicaInfo replicaInfo1 =
+      cache.fetchOrCreate(new ExtendedBlockId(123, "test_bp1"),
+        new SimpleReplicaCreator(123, cache, pair));
+    Preconditions.checkNotNull(replicaInfo1.getReplica());
+    Preconditions.checkState(replicaInfo1.getInvalidTokenException() == null);
+    pair.compareWith(replicaInfo1.getReplica().getDataStream(),
+                     replicaInfo1.getReplica().getMetaStream());
+    ShortCircuitReplicaInfo replicaInfo2 =
+      cache.fetchOrCreate(new ExtendedBlockId(123, "test_bp1"),
+          new ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          Assert.fail("expected to use existing entry.");
+          return null;
+        }
+      });
+    Preconditions.checkNotNull(replicaInfo2.getReplica());
+    Preconditions.checkState(replicaInfo2.getInvalidTokenException() == null);
+    Preconditions.checkState(replicaInfo1 == replicaInfo2);
+    pair.compareWith(replicaInfo2.getReplica().getDataStream(),
+                     replicaInfo2.getReplica().getMetaStream());
+    replicaInfo1.getReplica().unref();
+    replicaInfo2.getReplica().unref();
+    
+    // Even after the reference count falls to 0, we still keep the replica
+    // around for a while (we have configured the expiry period to be really,
+    // really long here)
+    ShortCircuitReplicaInfo replicaInfo3 =
+      cache.fetchOrCreate(
+          new ExtendedBlockId(123, "test_bp1"), new ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          Assert.fail("expected to use existing entry.");
+          return null;
+        }
+      });
+    Preconditions.checkNotNull(replicaInfo3.getReplica());
+    Preconditions.checkState(replicaInfo3.getInvalidTokenException() == null);
+    replicaInfo3.getReplica().unref();
+    
+    pair.close();
+    cache.close();
+  }
+
+  @Test(timeout=60000)
+  public void testExpiry() throws Exception {
+    final ShortCircuitCache cache =
+        new ShortCircuitCache(2, 1, 1, 10000000, 1, 10000);
+    final TestFileDescriptorPair pair = new TestFileDescriptorPair();
+    ShortCircuitReplicaInfo replicaInfo1 =
+      cache.fetchOrCreate(
+        new ExtendedBlockId(123, "test_bp1"), new SimpleReplicaCreator(123, cache, pair));
+    Preconditions.checkNotNull(replicaInfo1.getReplica());
+    Preconditions.checkState(replicaInfo1.getInvalidTokenException() == null);
+    pair.compareWith(replicaInfo1.getReplica().getDataStream(),
+                     replicaInfo1.getReplica().getMetaStream());
+    replicaInfo1.getReplica().unref();
+    final MutableBoolean triedToCreate = new MutableBoolean(false);
+    do {
+      Thread.sleep(10);
+      ShortCircuitReplicaInfo replicaInfo2 =
+        cache.fetchOrCreate(
+          new ExtendedBlockId(123, "test_bp1"), new ShortCircuitReplicaCreator() {
+          @Override
+          public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+            triedToCreate.setValue(true);
+            return null;
+          }
+        });
+      if ((replicaInfo2 != null) && (replicaInfo2.getReplica() != null)) {
+        replicaInfo2.getReplica().unref();
+      }
+    } while (triedToCreate.isFalse());
+    cache.close();
+  }
+  
+  
+  @Test(timeout=60000)
+  public void testEviction() throws Exception {
+    final ShortCircuitCache cache =
+        new ShortCircuitCache(2, 10000000, 1, 10000000, 1, 10000);
+    final TestFileDescriptorPair pairs[] = new TestFileDescriptorPair[] {
+      new TestFileDescriptorPair(),
+      new TestFileDescriptorPair(),
+      new TestFileDescriptorPair(),
+    };
+    ShortCircuitReplicaInfo replicaInfos[] = new ShortCircuitReplicaInfo[] {
+      null,
+      null,
+      null
+    };
+    for (int i = 0; i < pairs.length; i++) {
+      replicaInfos[i] = cache.fetchOrCreate(
+          new ExtendedBlockId(i, "test_bp1"), 
+            new SimpleReplicaCreator(i, cache, pairs[i]));
+      Preconditions.checkNotNull(replicaInfos[i].getReplica());
+      Preconditions.checkState(replicaInfos[i].getInvalidTokenException() == null);
+      pairs[i].compareWith(replicaInfos[i].getReplica().getDataStream(),
+                           replicaInfos[i].getReplica().getMetaStream());
+    }
+    // At this point, we have 3 replicas in use.
+    // Let's close them all.
+    for (int i = 0; i < pairs.length; i++) {
+      replicaInfos[i].getReplica().unref();
+    }
+    // The last two replicas should still be cached.
+    for (int i = 1; i < pairs.length; i++) {
+      final Integer iVal = new Integer(i);
+      replicaInfos[i] = cache.fetchOrCreate(
+          new ExtendedBlockId(i, "test_bp1"),
+            new ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          Assert.fail("expected to use existing entry for " + iVal);
+          return null;
+        }
+      });
+      Preconditions.checkNotNull(replicaInfos[i].getReplica());
+      Preconditions.checkState(replicaInfos[i].getInvalidTokenException() == null);
+      pairs[i].compareWith(replicaInfos[i].getReplica().getDataStream(),
+                           replicaInfos[i].getReplica().getMetaStream());
+    }
+    // The first (oldest) replica should not be cached.
+    final MutableBoolean calledCreate = new MutableBoolean(false);
+    replicaInfos[0] = cache.fetchOrCreate(
+        new ExtendedBlockId(0, "test_bp1"),
+          new ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          calledCreate.setValue(true);
+          return null;
+        }
+      });
+    Preconditions.checkState(replicaInfos[0].getReplica() == null);
+    Assert.assertTrue(calledCreate.isTrue());
+    // Clean up
+    for (int i = 1; i < pairs.length; i++) {
+      replicaInfos[i].getReplica().unref();
+    }
+    for (int i = 0; i < pairs.length; i++) {
+      pairs[i].close();
+    }
+    cache.close();
+  }
+  
+  @Test(timeout=60000)
+  public void testStaleness() throws Exception {
+    // Set up the cache with a short staleness time.
+    final ShortCircuitCache cache =
+        new ShortCircuitCache(2, 10000000, 1, 10000000, 1, 10);
+    final TestFileDescriptorPair pairs[] = new TestFileDescriptorPair[] {
+      new TestFileDescriptorPair(),
+      new TestFileDescriptorPair(),
+    };
+    ShortCircuitReplicaInfo replicaInfos[] = new ShortCircuitReplicaInfo[] {
+      null,
+      null
+    };
+    final long HOUR_IN_MS = 60 * 60 * 1000;
+    for (int i = 0; i < pairs.length; i++) {
+      final Integer iVal = new Integer(i);
+      final ExtendedBlockId key = new ExtendedBlockId(i, "test_bp1");
+      replicaInfos[i] = cache.fetchOrCreate(key,
+          new ShortCircuitReplicaCreator() {
+        @Override
+        public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+          try {
+            return new ShortCircuitReplicaInfo(
+                new ShortCircuitReplica(key,
+                    pairs[iVal].getFileInputStreams()[0],
+                    pairs[iVal].getFileInputStreams()[1],
+                    cache, Time.monotonicNow() + (iVal * HOUR_IN_MS)));
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      });
+      Preconditions.checkNotNull(replicaInfos[i].getReplica());
+      Preconditions.checkState(replicaInfos[i].getInvalidTokenException() == null);
+      pairs[i].compareWith(replicaInfos[i].getReplica().getDataStream(),
+                           replicaInfos[i].getReplica().getMetaStream());
+    }
+
+    // Keep trying to fetchOrCreate block 0 until it goes stale (and we must re-create it).
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        ShortCircuitReplicaInfo info = cache.fetchOrCreate(
+          new ExtendedBlockId(0, "test_bp1"), new ShortCircuitReplicaCreator() {
+          @Override
+          public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+            return null;
+          }
+        });
+        if (info.getReplica() != null) {
+          info.getReplica().unref();
+          return false;
+        }
+        return true;
+      }
+    }, 500, 60000);
+
+    // Make sure that second replica did not go stale.
+    ShortCircuitReplicaInfo info = cache.fetchOrCreate(
+        new ExtendedBlockId(1, "test_bp1"), new ShortCircuitReplicaCreator() {
+      @Override
+      public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
+        Assert.fail("second replica went stale, despite 1 " +
+            "hour staleness time.");
+        return null;
+      }
+    });
+    info.getReplica().unref();
+
+    // Clean up
+    for (int i = 1; i < pairs.length; i++) {
+      replicaInfos[i].getReplica().unref();
+    }
+    cache.close();
+  }
+}
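
The three tests above all exercise the same ShortCircuitCache contract: fetchOrCreate() looks up a replica by its ExtendedBlockId and only invokes the supplied ShortCircuitReplicaCreator on a miss, and every replica handed back must eventually be released with unref(). A minimal sketch of that calling pattern (not part of the patch; it assumes the same imports as TestShortCircuitCache and uses only calls that appear above):

    // Sketch only: fetchOrCreate() + unref(), with a creator that merely
    // records whether the cache missed and declines to create a replica.
    static void sketchFetchOrCreate() throws Exception {
      ShortCircuitCache cache = new ShortCircuitCache(2, 1, 1, 10000000, 1, 10000);
      final MutableBoolean missed = new MutableBoolean(false);
      ShortCircuitReplicaInfo info = cache.fetchOrCreate(
          new ExtendedBlockId(123, "test_bp1"), new ShortCircuitReplicaCreator() {
            @Override
            public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
              missed.setValue(true);  // invoked only when no usable replica is cached
              return null;            // returning null means "could not create one"
            }
          });
      if ((info != null) && (info.getReplica() != null)) {
        info.getReplica().unref();    // callers must release every replica they fetch
      }
      cache.close();
    }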

+ 16 - 21
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestShortCircuitLocalRead.java

@@ -27,6 +27,7 @@ import java.io.RandomAccessFile;
 import java.net.URI;
 import java.nio.ByteBuffer;
 import java.security.PrivilegedExceptionAction;
+import java.util.UUID;
 import java.util.concurrent.TimeoutException;
 
 import org.apache.hadoop.conf.Configuration;
@@ -35,8 +36,9 @@ import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
 import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
-import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
@@ -125,8 +127,9 @@ public class TestShortCircuitLocalRead {
       throws IOException, InterruptedException {
     // Ensure short circuit is enabled
     DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
+    ClientContext getClientContext = ClientContext.getFromConf(conf);
     if (legacyShortCircuitFails) {
-      assertTrue(fs.getClient().useLegacyBlockReaderLocal());
+      assertFalse(getClientContext.getDisableLegacyBlockReaderLocal());
     }
     
     FSDataInputStream stm = fs.open(name);
@@ -155,7 +158,7 @@ public class TestShortCircuitLocalRead {
     checkData(actual, readOffset, expected, "Read 3");
     
     if (legacyShortCircuitFails) {
-      assertFalse(fs.getClient().useLegacyBlockReaderLocal());
+      assertTrue(getClientContext.getDisableLegacyBlockReaderLocal());
     }
     stm.close();
   }
@@ -175,8 +178,9 @@ public class TestShortCircuitLocalRead {
       throws IOException, InterruptedException {
     // Ensure short circuit is enabled
     DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
+    ClientContext clientContext = ClientContext.getFromConf(conf);
     if (legacyShortCircuitFails) {
-      assertTrue(fs.getClient().useLegacyBlockReaderLocal());
+      assertTrue(clientContext.getDisableLegacyBlockReaderLocal());
     }
     
     HdfsDataInputStream stm = (HdfsDataInputStream)fs.open(name);
@@ -209,7 +213,7 @@ public class TestShortCircuitLocalRead {
     }
     checkData(arrayFromByteBuffer(actual), readOffset, expected, "Read 3");
     if (legacyShortCircuitFails) {
-      assertFalse(fs.getClient().useLegacyBlockReaderLocal());
+      assertTrue(clientContext.getDisableLegacyBlockReaderLocal());
     }
     stm.close();
   }
@@ -223,7 +227,6 @@ public class TestShortCircuitLocalRead {
 
   public void doTestShortCircuitRead(boolean ignoreChecksum, int size,
       int readOffset) throws IOException, InterruptedException {
-    String shortCircuitUser = getCurrentUser();
     doTestShortCircuitReadImpl(ignoreChecksum, size, readOffset,
         null, getCurrentUser(), false);
   }
@@ -239,6 +242,10 @@ public class TestShortCircuitLocalRead {
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
     conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
         ignoreChecksum);
+    // Set a random client context name so that we don't share a cache with
+    // other invocations of this function.
+    conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT,
+        UUID.randomUUID().toString());
     conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
         new File(sockDir.getDir(),
           "TestShortCircuitLocalRead._PORT.sock").getAbsolutePath());
@@ -322,18 +329,6 @@ public class TestShortCircuitLocalRead {
     doTestShortCircuitRead(true, 10*blockSize+100, 777);
   }
 
-  private ClientDatanodeProtocol getProxy(UserGroupInformation ugi,
-      final DatanodeID dnInfo, final Configuration conf) throws IOException,
-      InterruptedException {
-    return ugi.doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
-      @Override
-      public ClientDatanodeProtocol run() throws Exception {
-        return DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf, 60000,
-            false);
-      }
-    });
-  }
-  
   private static DistributedFileSystem getFileSystem(String user, final URI uri,
       final Configuration conf) throws InterruptedException, IOException {
     UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
@@ -555,8 +550,7 @@ public class TestShortCircuitLocalRead {
           for (int i = 0; i < iteration; i++) {
             try {
               String user = getCurrentUser();
-              checkFileContent(fs.getUri(), file1, dataToWrite, 0, user, conf,
-                  true);
+              checkFileContent(fs.getUri(), file1, dataToWrite, 0, user, conf, true);
             } catch (IOException e) {
               e.printStackTrace();
             } catch (InterruptedException e) {
@@ -608,7 +602,8 @@ public class TestShortCircuitLocalRead {
     stm.write(fileData);
     stm.close();
     try {
-      checkFileContent(uri, file1, fileData, readOffset, shortCircuitUser, conf, shortCircuitFails);
+      checkFileContent(uri, file1, fileData, readOffset, shortCircuitUser, 
+          conf, shortCircuitFails);
       //RemoteBlockReader have unsupported method read(ByteBuffer bf)
       assertTrue("RemoteBlockReader unsupported method read(ByteBuffer bf) error",
                     checkUnsupportedMethod(fs, file1, fileData, readOffset));
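
The new DFS_CLIENT_CONTEXT setting above keys the client-side cache: clients that share a context name share one ClientContext (and therefore one short-circuit cache), which is why each invocation sets a random UUID name to stay isolated. A small sketch of the isolation this buys, using only calls shown in this hunk (the distinct-instance expectation is my reading of ClientContext, not something the patch states explicitly):

    // Two configurations with different dfs.client.context values should map to
    // two independent ClientContext instances, i.e. two separate caches.
    Configuration confA = new Configuration();
    confA.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, UUID.randomUUID().toString());
    Configuration confB = new Configuration();
    confB.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, UUID.randomUUID().toString());
    assert ClientContext.getFromConf(confA) != ClientContext.getFromConf(confB);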

+ 64 - 31
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockTokenWithDFS.java

@@ -38,10 +38,16 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockReader;
 import org.apache.hadoop.hdfs.BlockReaderFactory;
+import org.apache.hadoop.hdfs.ClientContext;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.DFSClient.Conf;
+import org.apache.hadoop.hdfs.RemotePeerFactory;
+import org.apache.hadoop.hdfs.client.ShortCircuitCache;
+import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
+import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.net.TcpPeerServer;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -55,10 +61,13 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.log4j.Level;
+import org.junit.Assert;
 import org.junit.Test;
 
 public class TestBlockTokenWithDFS {
@@ -131,50 +140,70 @@ public class TestBlockTokenWithDFS {
   }
 
   // try reading a block using a BlockReader directly
-  private static void tryRead(Configuration conf, LocatedBlock lblock,
+  private static void tryRead(final Configuration conf, LocatedBlock lblock,
       boolean shouldSucceed) {
     InetSocketAddress targetAddr = null;
-    Socket s = null;
+    IOException ioe = null;
     BlockReader blockReader = null;
     ExtendedBlock block = lblock.getBlock();
     try {
       DatanodeInfo[] nodes = lblock.getLocations();
       targetAddr = NetUtils.createSocketAddr(nodes[0].getXferAddr());
-      s = NetUtils.getDefaultSocketFactory(conf).createSocket();
-      s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
-      s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
-
-      String file = BlockReaderFactory.getFileName(targetAddr, 
-          "test-blockpoolid", block.getBlockId());
-      blockReader = BlockReaderFactory.newBlockReader(
-          new DFSClient.Conf(conf), file, block, lblock.getBlockToken(), 0, -1,
-          true, "TestBlockTokenWithDFS", TcpPeerServer.peerFromSocket(s),
-          nodes[0], null, null, null, false,
-          CachingStrategy.newDefaultStrategy());
 
+      blockReader = new BlockReaderFactory(new DFSClient.Conf(conf)).
+          setFileName(BlockReaderFactory.getFileName(targetAddr, 
+                        "test-blockpoolid", block.getBlockId())).
+          setBlock(block).
+          setBlockToken(lblock.getBlockToken()).
+          setInetSocketAddress(targetAddr).
+          setStartOffset(0).
+          setLength(-1).
+          setVerifyChecksum(true).
+          setClientName("TestBlockTokenWithDFS").
+          setDatanodeInfo(nodes[0]).
+          setCachingStrategy(CachingStrategy.newDefaultStrategy()).
+          setClientCacheContext(ClientContext.getFromConf(conf)).
+          setConfiguration(conf).
+          setRemotePeerFactory(new RemotePeerFactory() {
+            @Override
+            public Peer newConnectedPeer(InetSocketAddress addr)
+                throws IOException {
+              Peer peer = null;
+              Socket sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
+              try {
+                sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
+                sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
+                peer = TcpPeerServer.peerFromSocket(sock);
+              } finally {
+                if (peer == null) {
+                  IOUtils.closeSocket(sock);
+                }
+              }
+              return peer;
+            }
+          }).
+          build();
     } catch (IOException ex) {
-      if (ex instanceof InvalidBlockTokenException) {
-        assertFalse("OP_READ_BLOCK: access token is invalid, "
-            + "when it is expected to be valid", shouldSucceed);
-        return;
-      }
-      fail("OP_READ_BLOCK failed due to reasons other than access token: "
-          + StringUtils.stringifyException(ex));
+      ioe = ex;
     } finally {
-      if (s != null) {
+      if (blockReader != null) {
         try {
-          s.close();
-        } catch (IOException iex) {
-        } finally {
-          s = null;
+          blockReader.close();
+        } catch (IOException e) {
+          throw new RuntimeException(e);
         }
       }
     }
-    if (blockReader == null) {
-      fail("OP_READ_BLOCK failed due to reasons other than access token");
+    if (shouldSucceed) {
+      Assert.assertNotNull("OP_READ_BLOCK: access token is invalid, "
+            + "when it is expected to be valid", blockReader);
+    } else {
+      Assert.assertNotNull("OP_READ_BLOCK: access token is valid, "
+          + "when it is expected to be invalid", ioe);
+      Assert.assertTrue(
+          "OP_READ_BLOCK failed due to reasons other than access token: ",
+          ioe instanceof InvalidBlockTokenException);
     }
-    assertTrue("OP_READ_BLOCK: access token is valid, "
-        + "when it is expected to be invalid", shouldSucceed);
   }
 
   // get a conf for testing
@@ -347,9 +376,13 @@ public class TestBlockTokenWithDFS {
       /*
        * testing READ interface on DN using a BlockReader
        */
-
-      new DFSClient(new InetSocketAddress("localhost",
+      DFSClient client = null;
+      try {
+        client = new DFSClient(new InetSocketAddress("localhost",
           cluster.getNameNodePort()), conf);
+      } finally {
+        if (client != null) client.close();
+      }
       List<LocatedBlock> locatedBlocks = nnProto.getBlockLocations(
           FILE_TO_READ, 0, FILE_SIZE).getLocatedBlocks();
       LocatedBlock lblock = locatedBlocks.get(0); // first block
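
The reworked tryRead() above no longer decides pass or fail inside the catch block; it records the IOException, closes the reader, and asserts on the outcome afterwards. In generic JUnit terms the pattern looks like the sketch below (operationUnderTest and shouldSucceed are hypothetical placeholders, not names from the patch):

    // Capture the failure instead of asserting inside the catch block, then
    // make exactly one assertion on the outcome.
    IOException failure = null;
    try {
      operationUnderTest();             // hypothetical call being tested
    } catch (IOException e) {
      failure = e;
    }
    if (shouldSucceed) {
      Assert.assertNull("unexpected failure", failure);
    } else {
      Assert.assertNotNull("expected a failure", failure);
    }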

+ 161 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicyConsiderLoad.java

@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.blockmanagement;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.StorageType;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
+import org.apache.hadoop.hdfs.server.common.StorageInfo;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.test.PathUtils;
+import org.apache.hadoop.util.VersionInfo;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TestReplicationPolicyConsiderLoad {
+
+  private static NameNode namenode;
+  private static DatanodeManager dnManager;
+  private static List<DatanodeRegistration> dnrList;
+  private static DatanodeDescriptor[] dataNodes;
+  private static DatanodeStorageInfo[] storages;
+
+  @BeforeClass
+  public static void setupCluster() throws IOException {
+    Configuration conf = new HdfsConfiguration();
+    final String[] racks = {
+        "/rack1",
+        "/rack1",
+        "/rack1",
+        "/rack2",
+        "/rack2",
+        "/rack2"};
+    storages = DFSTestUtil.createDatanodeStorageInfos(racks);
+    dataNodes = DFSTestUtil.toDatanodeDescriptor(storages);
+    FileSystem.setDefaultUri(conf, "hdfs://localhost:0");
+    conf.set(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY, "0.0.0.0:0");
+    File baseDir = PathUtils.getTestDir(TestReplicationPolicy.class);
+    conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
+        new File(baseDir, "name").getPath());
+    conf.setBoolean(
+        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_KEY, true);
+    conf.setBoolean(
+        DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY, true);
+    conf.setBoolean(
+        DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, true);
+    DFSTestUtil.formatNameNode(conf);
+    namenode = new NameNode(conf);
+    int blockSize = 1024;
+
+    dnrList = new ArrayList<DatanodeRegistration>();
+    dnManager = namenode.getNamesystem().getBlockManager().getDatanodeManager();
+
+    // Register DNs
+    for (int i=0; i < 6; i++) {
+      DatanodeRegistration dnr = new DatanodeRegistration(dataNodes[i],
+          new StorageInfo(), new ExportedBlockKeys(), VersionInfo.getVersion());
+      dnrList.add(dnr);
+      dnManager.registerDatanode(dnr);
+      dataNodes[i].getStorageInfos()[0].setUtilizationForTesting(
+          2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*blockSize, 0L,
+          2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*blockSize, 0L);
+      dataNodes[i].updateHeartbeat(
+          BlockManagerTestUtil.getStorageReportsForDatanode(dataNodes[i]),
+          0L, 0L, 0, 0);
+    }
+  }
+
+  /**
+   * Tests that chooseTarget with considerLoad set to true correctly calculates
+   * load with decommissioned nodes.
+   */
+  @Test
+  public void testChooseTargetWithDecomNodes() throws IOException {
+    namenode.getNamesystem().writeLock();
+    try {
+      // Decommission DNs so BlockPlacementPolicyDefault.isGoodTarget()
+      // returns false
+      for (int i = 0; i < 3; i++) {
+        DatanodeInfo d = dnManager.getDatanodeByXferAddr(
+            dnrList.get(i).getIpAddr(),
+            dnrList.get(i).getXferPort());
+        d.setDecommissioned();
+      }
+      String blockPoolId = namenode.getNamesystem().getBlockPoolId();
+      dnManager.handleHeartbeat(dnrList.get(3),
+          BlockManagerTestUtil.getStorageReportsForDatanode(dataNodes[3]),
+          blockPoolId, dataNodes[3].getCacheCapacity(),
+          dataNodes[3].getCacheRemaining(),
+          2, 0, 0);
+      dnManager.handleHeartbeat(dnrList.get(4),
+          BlockManagerTestUtil.getStorageReportsForDatanode(dataNodes[4]),
+          blockPoolId, dataNodes[4].getCacheCapacity(),
+          dataNodes[4].getCacheRemaining(),
+          4, 0, 0);
+      dnManager.handleHeartbeat(dnrList.get(5),
+          BlockManagerTestUtil.getStorageReportsForDatanode(dataNodes[5]),
+          blockPoolId, dataNodes[5].getCacheCapacity(),
+          dataNodes[5].getCacheRemaining(),
+          4, 0, 0);
+
+      // Call chooseTarget()
+      DatanodeStorageInfo[] targets = namenode.getNamesystem().getBlockManager()
+          .getBlockPlacementPolicy().chooseTarget("testFile.txt", 3,
+              dataNodes[0], new ArrayList<DatanodeStorageInfo>(), false, null,
+              1024, StorageType.DEFAULT);
+
+      assertEquals(3, targets.length);
+      Set<DatanodeStorageInfo> targetSet = new HashSet<DatanodeStorageInfo>(
+          Arrays.asList(targets));
+      for (int i = 3; i < storages.length; i++) {
+        assertTrue(targetSet.contains(storages[i]));
+      }
+    } finally {
+      dataNodes[0].stopDecommission();
+      dataNodes[1].stopDecommission();
+      dataNodes[2].stopDecommission();
+      namenode.getNamesystem().writeUnlock();
+    }
+    NameNode.LOG.info("Done working on it");
+  }
+
+  @AfterClass
+  public static void teardownCluster() {
+    if (namenode != null) namenode.stop();
+  }
+
+}
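
The expected targets in testChooseTargetWithDecomNodes follow from how the default placement policy treats load when DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY is set: as I read BlockPlacementPolicyDefault (the patch itself does not restate the rule), a candidate is rejected when its xceiver count exceeds roughly twice the average load of the in-service datanodes. The three live nodes report 2, 4 and 4 xceivers, so the in-service average is 10/3 ≈ 3.3 and the cutoff is ≈ 6.7, which all three pass. If the three decommissioned nodes were wrongly counted, the average would drop to 10/6 ≈ 1.7, the two nodes reporting 4 would be rejected, and chooseTarget could not return the three targets the test asserts.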

+ 38 - 14
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java

@@ -35,11 +35,14 @@ import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockReader;
 import org.apache.hadoop.hdfs.BlockReaderFactory;
+import org.apache.hadoop.hdfs.ClientContext;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.RemotePeerFactory;
+import org.apache.hadoop.hdfs.net.Peer;
 import org.apache.hadoop.hdfs.net.TcpPeerServer;
 import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
@@ -48,13 +51,14 @@ import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
-import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
 import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -284,23 +288,43 @@ public class TestDataNodeVolumeFailure {
   private void accessBlock(DatanodeInfo datanode, LocatedBlock lblock)
     throws IOException {
     InetSocketAddress targetAddr = null;
-    Socket s = null;
     ExtendedBlock block = lblock.getBlock(); 
    
     targetAddr = NetUtils.createSocketAddr(datanode.getXferAddr());
-      
-    s = NetUtils.getDefaultSocketFactory(conf).createSocket();
-    s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
-    s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
 
-    String file = BlockReaderFactory.getFileName(targetAddr, 
-        "test-blockpoolid",
-        block.getBlockId());
-    BlockReader blockReader =
-      BlockReaderFactory.newBlockReader(new DFSClient.Conf(conf), file, block,
-        lblock.getBlockToken(), 0, -1, true, "TestDataNodeVolumeFailure",
-        TcpPeerServer.peerFromSocket(s), datanode, null, null, null, false,
-        CachingStrategy.newDefaultStrategy());
+    BlockReader blockReader = new BlockReaderFactory(new DFSClient.Conf(conf)).
+      setInetSocketAddress(targetAddr).
+      setBlock(block).
+      setFileName(BlockReaderFactory.getFileName(targetAddr,
+                    "test-blockpoolid", block.getBlockId())).
+      setBlockToken(lblock.getBlockToken()).
+      setStartOffset(0).
+      setLength(-1).
+      setVerifyChecksum(true).
+      setClientName("TestDataNodeVolumeFailure").
+      setDatanodeInfo(datanode).
+      setCachingStrategy(CachingStrategy.newDefaultStrategy()).
+      setClientCacheContext(ClientContext.getFromConf(conf)).
+      setConfiguration(conf).
+      setRemotePeerFactory(new RemotePeerFactory() {
+        @Override
+        public Peer newConnectedPeer(InetSocketAddress addr)
+            throws IOException {
+          Peer peer = null;
+          Socket sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
+          try {
+            sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
+            sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
+            peer = TcpPeerServer.peerFromSocket(sock);
+          } finally {
+            if (peer == null) {
+              IOUtils.closeSocket(sock);
+            }
+          }
+          return peer;
+        }
+      }).
+      build();
     blockReader.close();
   }
   

+ 36 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeduplicationMap.java

@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode;
+
+import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SaverContext.DeduplicationMap;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestDeduplicationMap {
+  @Test
+  public void testDeduplicationMap() {
+    DeduplicationMap<String> m = DeduplicationMap.newMap();
+    Assert.assertEquals(1, m.getId("1"));
+    Assert.assertEquals(2, m.getId("2"));
+    Assert.assertEquals(3, m.getId("3"));
+    Assert.assertEquals(1, m.getId("1"));
+    Assert.assertEquals(2, m.getId("2"));
+    Assert.assertEquals(3, m.getId("3"));
+  }
+}
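
TestDeduplicationMap pins down a simple contract: getId() hands out sequential ids starting at 1 and returns the same id when it sees an equal value again. A hypothetical stand-in that satisfies that contract (a sketch, not the FSImageFormatProtobuf.SaverContext implementation):

    // Minimal illustration of the id-assignment contract checked above.
    class SimpleDedupMap<T> {
      private final java.util.Map<T, Integer> ids = new java.util.HashMap<T, Integer>();
      int getId(T value) {
        Integer id = ids.get(value);
        if (id == null) {
          id = ids.size() + 1;   // first distinct value -> 1, next -> 2, ...
          ids.put(value, id);
        }
        return id;
      }
    }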

+ 138 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSImage.java

@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.EnumSet;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSOutputStream;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.client.HdfsDataOutputStream.SyncFlag;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
+import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
+import org.apache.hadoop.hdfs.util.MD5FileUtils;
+import org.junit.Test;
+
+public class TestFSImage {
+
+  @Test
+  public void testPersist() throws IOException {
+    Configuration conf = new Configuration();
+    testPersistHelper(conf);
+  }
+
+  @Test
+  public void testCompression() throws IOException {
+    Configuration conf = new Configuration();
+    conf.setBoolean(DFSConfigKeys.DFS_IMAGE_COMPRESS_KEY, true);
+    conf.set(DFSConfigKeys.DFS_IMAGE_COMPRESSION_CODEC_KEY,
+        "org.apache.hadoop.io.compress.GzipCodec");
+    testPersistHelper(conf);
+  }
+
+  private void testPersistHelper(Configuration conf) throws IOException {
+    MiniDFSCluster cluster = null;
+    try {
+      cluster = new MiniDFSCluster.Builder(conf).build();
+      cluster.waitActive();
+      FSNamesystem fsn = cluster.getNamesystem();
+      DistributedFileSystem fs = cluster.getFileSystem();
+
+      final Path dir = new Path("/abc/def");
+      final Path file1 = new Path(dir, "f1");
+      final Path file2 = new Path(dir, "f2");
+
+      // create an empty file f1
+      fs.create(file1).close();
+
+      // create an under-construction file f2
+      FSDataOutputStream out = fs.create(file2);
+      out.writeBytes("hello");
+      ((DFSOutputStream) out.getWrappedStream()).hsync(EnumSet
+          .of(SyncFlag.UPDATE_LENGTH));
+
+      // checkpoint
+      fs.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
+      fs.saveNamespace();
+      fs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
+
+      cluster.restartNameNode();
+      cluster.waitActive();
+      fs = cluster.getFileSystem();
+
+      assertTrue(fs.isDirectory(dir));
+      assertTrue(fs.exists(file1));
+      assertTrue(fs.exists(file2));
+
+      // check internals of file2
+      INodeFile file2Node = fsn.dir.getINode4Write(file2.toString()).asFile();
+      assertEquals("hello".length(), file2Node.computeFileSize());
+      assertTrue(file2Node.isUnderConstruction());
+      BlockInfo[] blks = file2Node.getBlocks();
+      assertEquals(1, blks.length);
+      assertEquals(BlockUCState.UNDER_CONSTRUCTION, blks[0].getBlockUCState());
+      // check lease manager
+      Lease lease = fsn.leaseManager.getLeaseByPath(file2.toString());
+      Assert.assertNotNull(lease);
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
+
+  /**
+   * Ensure that the digest written by the saver equals to the digest of the
+   * file.
+   */
+  @Test
+  public void testDigest() throws IOException {
+    Configuration conf = new Configuration();
+    MiniDFSCluster cluster = null;
+    try {
+      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0).build();
+      DistributedFileSystem fs = cluster.getFileSystem();
+      fs.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
+      fs.saveNamespace();
+      fs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
+      File currentDir = FSImageTestUtil.getNameNodeCurrentDirs(cluster, 0).get(
+          0);
+      File fsimage = FSImageTestUtil.findNewestImageFile(currentDir
+          .getAbsolutePath());
+      assertEquals(MD5FileUtils.readStoredMd5ForFile(fsimage),
+          MD5FileUtils.computeMd5ForFile(fsimage));
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
+}

+ 0 - 5
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java

@@ -27,17 +27,12 @@ import static org.junit.Assert.assertTrue;
 import java.io.File;
 import java.io.IOException;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
 import org.junit.Test;
 
 public class TestFSImageStorageInspector {
-  private static final Log LOG = LogFactory.getLog(
-      TestFSImageStorageInspector.class);
-
   /**
    * Simple test with image, edits, and inprogress edits
    */

Some files were not shown because too many files changed in this diff