
Merge branch 'trunk' into HDFS-7240

Anu Engineer, 7 years ago
commit 0becabcefd
100 changed files with 3643 additions and 630 deletions
  1. 1 1
      hadoop-assemblies/src/main/resources/assemblies/hadoop-yarn-dist.xml
  2. 2 3
      hadoop-build-tools/src/main/resources/checkstyle/checkstyle.xml
  3. 1 1
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileStatus.java
  4. 1 1
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java
  5. 0 39
      hadoop-common-project/hadoop-kms/src/test/java/org/apache/hadoop/crypto/key/kms/server/TestKMSWithZK.java
  6. 2 1
      hadoop-common-project/hadoop-minikdc/src/test/java/org/apache/hadoop/minikdc/TestMiniKdc.java
  7. 65 0
      hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
  8. 2 44
      hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsAdmin.java
  9. 27 1
      hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java
  10. 1 0
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/client/HttpFSFileSystem.java
  11. 18 2
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/FSOperations.java
  12. 27 3
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSParametersProvider.java
  13. 13 5
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSServer.java
  14. 174 0
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/server/TestHttpFSServer.java
  15. 8 2
      hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/server/TestHttpFSServerWebServer.java
  16. 9 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
  17. 2 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolServerSideTranslatorPB.java
  18. 3 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java
  19. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Dispatcher.java
  20. 3 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java
  21. 4 13
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
  22. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
  23. 2 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DiskBalancer.java
  24. 2 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/web/webhdfs/WebHdfsHandler.java
  25. 16 4
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/diskbalancer/command/ExecuteCommand.java
  26. 13 5
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/federation/router/Router.java
  27. 4 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
  28. 30 5
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
  29. 16 10
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/AbstractINodeDiffList.java
  30. 131 91
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DiffListBySkipList.java
  31. 70 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryDiffListFactory.java
  32. 34 8
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryWithSnapshotFeature.java
  33. 12 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotManager.java
  34. 4 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/web/resources/NamenodeWebHdfsMethods.java
  35. 5 4
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java
  36. 15 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DiskBalancerCLI.java
  37. 1 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/proto/NamenodeProtocol.proto
  38. 18 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
  39. 5 0
      hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSRouterFederation.md
  40. 223 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSInotifyEventInputStreamKerberized.java
  41. 44 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java
  42. 40 10
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
  43. 16 5
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/MiniQJMHACluster.java
  44. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java
  45. 17 4
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/DiskBalancerTestUtil.java
  46. 64 8
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/TestDiskBalancer.java
  47. 2 2
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/TestDiskBalancerRPC.java
  48. 40 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/command/TestDiskBalancerCommand.java
  49. 143 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterNamenodeMonitoring.java
  50. 110 17
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestDiffListBySkipList.java
  51. 57 0
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsUrl.java
  52. 1 1
      hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/pi/package.html
  53. 61 11
      hadoop-project/pom.xml
  54. 2 0
      hadoop-tools/hadoop-archive-logs/src/test/java/org/apache/hadoop/tools/TestHadoopArchiveLogs.java
  55. 36 0
      hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml
  56. 163 0
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/FailureInjectionPolicy.java
  57. 67 110
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InconsistentAmazonS3Client.java
  58. 232 0
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InconsistentS3Object.java
  59. 3 2
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java
  60. 35 9
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
  61. 76 46
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
  62. 87 0
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java
  63. 56 0
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java
  64. 36 9
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java
  65. 47 0
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3GuardExistsRetryPolicy.java
  66. 14 6
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java
  67. 3 0
      hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java
  68. 5 0
      hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
  69. 106 10
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java
  70. 4 9
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardListConsistency.java
  71. 6 0
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java
  72. 2 2
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
  73. 2 1
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java
  74. 174 0
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java
  75. 1 1
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java
  76. 0 48
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestDynamoDBMetadataStoreScale.java
  77. 58 0
      hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesSSECDiskBlocks.java
  78. 1 1
      hadoop-tools/hadoop-openstack/src/test/java/org/apache/hadoop/fs/swift/TestSwiftFileSystemBlockLocation.java
  79. 336 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/AllocationTagNamespace.java
  80. 74 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/AllocationTagNamespaceType.java
  81. 50 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/AllocationTags.java
  82. 38 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Evaluable.java
  83. 53 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/TargetApplications.java
  84. 17 9
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/resource/PlacementConstraints.java
  85. 34 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/exceptions/InvalidAllocationTagException.java
  86. 2 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/constraint/PlacementConstraintParser.java
  87. 10 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/pom.xml
  88. 41 19
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/LogAggregationIndexedFileController.java
  89. 21 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/Apps.java
  90. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AuxiliaryServiceHelper.java
  91. 54 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/TestLogAggregationIndexFileController.java
  92. 0 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/_SUCCESS
  93. 3 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/_index
  94. 2 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/_masterindex
  95. BIN
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/part-0
  96. 5 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java
  97. 79 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java
  98. 1 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto
  99. 41 23
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
  100. 3 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java

+ 1 - 1
hadoop-assemblies/src/main/resources/assemblies/hadoop-yarn-dist.xml

@@ -236,7 +236,7 @@
     </moduleSet>
     <moduleSet>
       <includes>
-        <include>org.apache.hadoop:hadoop-yarn-server-timelineservice-hbase-server</include>
+        <include>org.apache.hadoop:${hbase-server-artifactid}</include>
       </includes>
       <binaries>
         <outputDirectory>share/hadoop/${hadoop.component}/timelineservice</outputDirectory>

+ 2 - 3
hadoop-build-tools/src/main/resources/checkstyle/checkstyle.xml

@@ -50,8 +50,6 @@
 <module name="Checker">
 
     <module name="SuppressWarningsFilter"/>
-    <module name="SuppressionCommentFilter"/>
-    <module name="SuppressWithNearbyCommentFilter"/>
 
     <!-- Checks that a package.html file exists for each package.     -->
     <!-- See http://checkstyle.sf.net/config_javadoc.html#PackageHtml -->
@@ -72,7 +70,8 @@
     <module name="TreeWalker">
 
         <module name="SuppressWarningsHolder"/>
-        <module name="FileContentsHolder"/>
+        <module name="SuppressionCommentFilter"/>
+        <module name="SuppressWithNearbyCommentFilter"/>
 
 
         <!-- Checks for Javadoc comments.                     -->
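Moving SuppressionCommentFilter and SuppressWithNearbyCommentFilter under TreeWalker (and dropping FileContentsHolder) matches the newer Checkstyle module layout, where these comment filters are TreeWalker children. As an illustrative sketch only, not part of this commit and assuming the filters keep their default CHECKSTYLE:OFF/CHECKSTYLE:ON comment markers, a source file could then suppress checks around a block like this:

    // Hypothetical example; relies on SuppressionCommentFilter defaults.
    public final class LegacyConstants {
      // CHECKSTYLE:OFF
      public static final String legacy_constant_name = "kept for compatibility";
      // CHECKSTYLE:ON
    }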

+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileStatus.java

@@ -502,7 +502,7 @@ public class FileStatus implements Writable, Comparable<Object>,
     setPath(other.getPath());
     attr = attributes(other.hasAcl(), other.isEncrypted(),
         other.isErasureCoded(), other.isSnapshotEnabled());
-    assert (isDirectory() && getSymlink() == null) || !isDirectory();
+    assert !(isDirectory() && isSymlink()) : "A directory cannot be a symlink";
   }
 
   /**

+ 1 - 1
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java

@@ -1649,7 +1649,7 @@ public abstract class FileSystem extends Configured implements Closeable {
   /**
    * Mark a path to be deleted when its FileSystem is closed.
    * When the JVM shuts down cleanly, all cached FileSystem objects will be
-   * closed automatically —these the marked paths will be deleted as a result.
+   * closed automatically. The marked paths will be deleted as a result.
    *
    * If a FileSystem instance is not cached, i.e. has been created with
    * {@link #createFileSystem(URI, Configuration)}, then the paths will

+ 0 - 39
hadoop-common-project/hadoop-kms/src/test/java/org/apache/hadoop/crypto/key/kms/server/TestKMSWithZK.java

@@ -19,58 +19,19 @@ package org.apache.hadoop.crypto.key.kms.server;
 
 import org.apache.curator.test.TestingServer;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.crypto.key.KeyProvider;
-import org.apache.hadoop.crypto.key.KeyProvider.KeyVersion;
-import org.apache.hadoop.crypto.key.KeyProvider.Options;
-import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
-import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
-import org.apache.hadoop.crypto.key.KeyProviderDelegationTokenExtension;
-import org.apache.hadoop.crypto.key.kms.KMSClientProvider;
 import org.apache.hadoop.crypto.key.kms.KMSRESTConstants;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.minikdc.MiniKdc;
-import org.apache.hadoop.security.Credentials;
-import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.authentication.server.AuthenticationFilter;
 import org.apache.hadoop.security.authentication.util.ZKSignerSecretProvider;
-import org.apache.hadoop.security.authorize.AuthorizationException;
-import org.apache.hadoop.security.ssl.KeyStoreTestUtil;
 import org.apache.hadoop.security.token.delegation.web.DelegationTokenAuthenticatedURL;
-import org.junit.AfterClass;
 import org.junit.Assert;
-import org.junit.Before;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import javax.security.auth.Subject;
-import javax.security.auth.kerberos.KerberosPrincipal;
-import javax.security.auth.login.AppConfigurationEntry;
-import javax.security.auth.login.LoginContext;
 import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.Writer;
 import java.net.HttpURLConnection;
-import java.net.InetAddress;
-import java.net.InetSocketAddress;
-import java.net.ServerSocket;
-import java.net.SocketTimeoutException;
-import java.net.URI;
 import java.net.URL;
-import java.security.Principal;
 import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.UUID;
-import java.util.concurrent.Callable;
 
 public class TestKMSWithZK {
 

+ 2 - 1
hadoop-common-project/hadoop-minikdc/src/test/java/org/apache/hadoop/minikdc/TestMiniKdc.java

@@ -165,7 +165,8 @@ public class TestMiniKdc extends KerberosSecurityTestcase {
       loginContext.logout();
 
     } finally {
-      if (loginContext != null) {
+      if (loginContext != null && loginContext.getSubject() != null
+          && !loginContext.getSubject().getPrincipals().isEmpty()) {
         loginContext.logout();
       }
     }

+ 65 - 0
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java

@@ -37,6 +37,7 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FSDataOutputStreamBuilder;
 import org.apache.hadoop.fs.FSLinkResolver;
+import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.FileChecksum;
 import org.apache.hadoop.fs.FileEncryptionInfo;
 import org.apache.hadoop.fs.FileStatus;
@@ -2600,6 +2601,70 @@ public class DistributedFileSystem extends FileSystem
     }.resolve(this, absF);
   }
 
+  /* HDFS only */
+  public void provisionEZTrash(final Path path,
+      final FsPermission trashPermission) throws IOException {
+    Path absF = fixRelativePart(path);
+    new FileSystemLinkResolver<Void>() {
+      @Override
+      public Void doCall(Path p) throws IOException {
+        provisionEZTrash(getPathName(p), trashPermission);
+        return null;
+      }
+
+      @Override
+      public Void next(FileSystem fs, Path p) throws IOException {
+        if (fs instanceof DistributedFileSystem) {
+          DistributedFileSystem myDfs = (DistributedFileSystem)fs;
+          myDfs.provisionEZTrash(p, trashPermission);
+          return null;
+        }
+        throw new UnsupportedOperationException("Cannot provisionEZTrash " +
+            "through a symlink to a non-DistributedFileSystem: " + fs + " -> "
+            + p);
+      }
+    }.resolve(this, absF);
+  }
+
+  private void provisionEZTrash(String path, FsPermission trashPermission)
+      throws IOException {
+    // make sure the path is an EZ
+    EncryptionZone ez = dfs.getEZForPath(path);
+    if (ez == null) {
+      throw new IllegalArgumentException(path + " is not an encryption zone.");
+    }
+
+    String ezPath = ez.getPath();
+    if (!path.toString().equals(ezPath)) {
+      throw new IllegalArgumentException(path + " is not the root of an " +
+          "encryption zone. Do you mean " + ez.getPath() + "?");
+    }
+
+    // check if the trash directory exists
+    Path trashPath = new Path(ez.getPath(), FileSystem.TRASH_PREFIX);
+    try {
+      FileStatus trashFileStatus = getFileStatus(trashPath);
+      String errMessage = "Will not provision new trash directory for " +
+          "encryption zone " + ez.getPath() + ". Path already exists.";
+      if (!trashFileStatus.isDirectory()) {
+        errMessage += "\r\n" +
+            "Warning: " + trashPath.toString() + " is not a directory";
+      }
+      if (!trashFileStatus.getPermission().equals(trashPermission)) {
+        errMessage += "\r\n" +
+            "Warning: the permission of " +
+            trashPath.toString() + " is not " + trashPermission;
+      }
+      throw new FileAlreadyExistsException(errMessage);
+    } catch (FileNotFoundException ignored) {
+      // no trash path
+    }
+
+    // Update the permission bits
+    mkdir(trashPath, trashPermission);
+    setPermission(trashPath, trashPermission);
+  }
+
   @Override
   public void setXAttr(Path path, final String name, final byte[] value,
       final EnumSet<XAttrSetFlag> flag) throws IOException {

+ 2 - 44
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsAdmin.java

@@ -24,9 +24,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.crypto.key.KeyProvider;
 import org.apache.hadoop.fs.BlockStoragePolicySpi;
 import org.apache.hadoop.fs.CacheFlag;
-import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.FileEncryptionInfo;
-import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
@@ -330,7 +328,7 @@ public class HdfsAdmin {
         throw new HadoopIllegalArgumentException(
             "can not have both PROVISION_TRASH and NO_TRASH flags");
       }
-      this.provisionEZTrash(path);
+      dfs.provisionEZTrash(path, TRASH_PERMISSION);
     }
   }
 
@@ -341,7 +339,7 @@ public class HdfsAdmin {
    * @throws IOException if the trash directory can not be created.
    */
   public void provisionEncryptionZoneTrash(Path path) throws IOException {
-    this.provisionEZTrash(path);
+    dfs.provisionEZTrash(path, TRASH_PERMISSION);
   }
 
   /**
@@ -603,46 +601,6 @@ public class HdfsAdmin {
     dfs.disableErasureCodingPolicy(ecPolicyName);
   }
 
-  private void provisionEZTrash(Path path) throws IOException {
-    // make sure the path is an EZ
-    EncryptionZone ez = dfs.getEZForPath(path);
-    if (ez == null) {
-      throw new IllegalArgumentException(path + " is not an encryption zone.");
-    }
-
-    String ezPath = ez.getPath();
-    if (!path.toString().equals(ezPath)) {
-      throw new IllegalArgumentException(path + " is not the root of an " +
-          "encryption zone. Do you mean " + ez.getPath() + "?");
-    }
-
-    // check if the trash directory exists
-
-    Path trashPath = new Path(ez.getPath(), FileSystem.TRASH_PREFIX);
-
-    try {
-      FileStatus trashFileStatus = dfs.getFileStatus(trashPath);
-      String errMessage = "Will not provision new trash directory for " +
-          "encryption zone " + ez.getPath() + ". Path already exists.";
-      if (!trashFileStatus.isDirectory()) {
-        errMessage += "\r\n" +
-            "Warning: " + trashPath.toString() + " is not a directory";
-      }
-      if (!trashFileStatus.getPermission().equals(TRASH_PERMISSION)) {
-        errMessage += "\r\n" +
-            "Warning: the permission of " +
-            trashPath.toString() + " is not " + TRASH_PERMISSION;
-      }
-      throw new FileAlreadyExistsException(errMessage);
-    } catch (FileNotFoundException ignored) {
-      // no trash path
-    }
-
-    // Update the permission bits
-    dfs.mkdir(trashPath, TRASH_PERMISSION);
-    dfs.setPermission(trashPath, TRASH_PERMISSION);
-  }
-
   /**
    * Returns a RemoteIterator which can be used to list all open files
    * currently managed by the NameNode. For large numbers of open files,
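With provisionEZTrash moved into DistributedFileSystem, HdfsAdmin now simply delegates to it for both createEncryptionZone(..., PROVISION_TRASH) and provisionEncryptionZoneTrash. A minimal usage sketch, not part of this commit (the NameNode URI and zone path are placeholders; the path must be the root of an existing encryption zone or an IllegalArgumentException is thrown):

    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hdfs.client.HdfsAdmin;

    public class ProvisionTrashExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder NameNode URI.
        HdfsAdmin admin = new HdfsAdmin(new URI("hdfs://namenode:8020"), conf);
        // Creates <zone>/.Trash with TRASH_PERMISSION; the work is now done by
        // DistributedFileSystem#provisionEZTrash.
        admin.provisionEncryptionZoneTrash(new Path("/enc/zone1"));
      }
    }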

+ 27 - 1
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java

@@ -37,6 +37,8 @@ import java.net.InetSocketAddress;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.security.PrivilegedExceptionAction;
 import java.util.ArrayList;
@@ -598,8 +600,32 @@ public class WebHdfsFileSystem extends FileSystem
   URL toUrl(final HttpOpParam.Op op, final Path fspath,
       final Param<?,?>... parameters) throws IOException {
     //initialize URI path and query
+
+    Path encodedFSPath = fspath;
+    if (fspath != null) {
+      URI fspathUri = fspath.toUri();
+      String fspathUriDecoded = fspathUri.getPath();
+      try {
+        fspathUriDecoded = URLDecoder.decode(fspathUri.getPath(), "UTF-8");
+      } catch (IllegalArgumentException ex) {
+        LOG.trace("Cannot decode URL encoded file", ex);
+      }
+      String[] fspathItems = fspathUriDecoded.split("/");
+
+      if (fspathItems.length > 0) {
+        StringBuilder fsPathEncodedItems = new StringBuilder();
+        for (String fsPathItem : fspathItems) {
+          fsPathEncodedItems.append("/");
+          fsPathEncodedItems.append(URLEncoder.encode(fsPathItem, "UTF-8"));
+        }
+        encodedFSPath = new Path(fspathUri.getScheme(),
+                fspathUri.getAuthority(), fsPathEncodedItems.substring(1));
+      }
+    }
+
     final String path = PATH_PREFIX
-        + (fspath == null? "/": makeQualified(fspath).toUri().getRawPath());
+        + (encodedFSPath == null ? "/" :
+            makeQualified(encodedFSPath).toUri().getRawPath());
     final String query = op.toQueryString()
         + Param.toSortedString("&", getAuthParameters(op))
         + Param.toSortedString("&", parameters);
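The toUrl() change decodes the path once and then re-encodes it segment by segment, so characters such as '%' or ';' in file names survive in the WebHDFS URL. A standalone sketch of that round trip, with illustrative class and method names (not from the patch):

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;
    import java.net.URLEncoder;

    public final class WebHdfsPathEncodingSketch {

      // Decode first (tolerating paths that are not URL-encoded), then
      // re-encode each "/"-separated segment, mirroring the hunk above.
      static String encodeSegments(String rawPath)
          throws UnsupportedEncodingException {
        String decoded = rawPath;
        try {
          decoded = URLDecoder.decode(rawPath, "UTF-8");
        } catch (IllegalArgumentException ex) {
          // Not valid URL encoding; keep the raw path.
        }
        StringBuilder out = new StringBuilder();
        for (String segment : decoded.split("/")) {
          if (!segment.isEmpty()) {
            out.append('/').append(URLEncoder.encode(segment, "UTF-8"));
          }
        }
        return out.length() == 0 ? "/" : out.toString();
      }

      public static void main(String[] args) throws Exception {
        // Prints /user/alice/semi%3Bcolon%25file
        System.out.println(encodeSegments("/user/alice/semi;colon%file"));
      }
    }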

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/client/HttpFSFileSystem.java

@@ -104,6 +104,7 @@ public class HttpFSFileSystem extends FileSystem
   public static final String REPLICATION_PARAM = "replication";
   public static final String BLOCKSIZE_PARAM = "blocksize";
   public static final String PERMISSION_PARAM = "permission";
+  public static final String UNMASKED_PERMISSION_PARAM = "unmaskedpermission";
   public static final String ACLSPEC_PARAM = "aclspec";
   public static final String DESTINATION_PARAM = "destination";
   public static final String RECURSIVE_PARAM = "recursive";

+ 18 - 2
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/FSOperations.java

@@ -40,6 +40,7 @@ import org.apache.hadoop.lib.service.FileSystemAccess;
 import org.apache.hadoop.util.StringUtils;
 import org.json.simple.JSONArray;
 import org.json.simple.JSONObject;
+import org.apache.hadoop.fs.permission.FsCreateModes;
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
@@ -473,6 +474,7 @@ public class FSOperations {
     private InputStream is;
     private Path path;
     private short permission;
+    private short unmaskedPermission;
     private boolean override;
     private short replication;
     private long blockSize;
@@ -486,12 +488,14 @@ public class FSOperations {
      * @param override if the file should be overriden if it already exist.
      * @param repl the replication factor for the file.
      * @param blockSize the block size for the file.
+     * @param unmaskedPerm unmasked permissions for the file
      */
     public FSCreate(InputStream is, String path, short perm, boolean override,
-                    short repl, long blockSize) {
+                    short repl, long blockSize, short unmaskedPerm) {
       this.is = is;
       this.path = new Path(path);
       this.permission = perm;
+      this.unmaskedPermission = unmaskedPerm;
       this.override = override;
       this.replication = repl;
       this.blockSize = blockSize;
@@ -515,6 +519,10 @@ public class FSOperations {
         blockSize = fs.getDefaultBlockSize(path);
       }
       FsPermission fsPermission = new FsPermission(permission);
+      if (unmaskedPermission != -1) {
+        fsPermission = FsCreateModes.create(fsPermission,
+            new FsPermission(unmaskedPermission));
+      }
       int bufferSize = fs.getConf().getInt(HTTPFS_BUFFER_SIZE_KEY,
           HTTP_BUFFER_SIZE_DEFAULT);
       OutputStream os = fs.create(path, fsPermission, override, bufferSize, replication, blockSize, null);
@@ -748,16 +756,20 @@ public class FSOperations {
 
     private Path path;
     private short permission;
+    private short unmaskedPermission;
 
     /**
      * Creates a mkdirs executor.
      *
      * @param path directory path to create.
      * @param permission permission to use.
+     * @param unmaskedPermission unmasked permissions for the directory
      */
-    public FSMkdirs(String path, short permission) {
+    public FSMkdirs(String path, short permission,
+        short unmaskedPermission) {
       this.path = new Path(path);
       this.permission = permission;
+      this.unmaskedPermission = unmaskedPermission;
     }
 
     /**
@@ -773,6 +785,10 @@ public class FSOperations {
     @Override
     public JSONObject execute(FileSystem fs) throws IOException {
       FsPermission fsPermission = new FsPermission(permission);
+      if (unmaskedPermission != -1) {
+        fsPermission = FsCreateModes.create(fsPermission,
+            new FsPermission(unmaskedPermission));
+      }
       boolean mkdirs = fs.mkdirs(path, fsPermission);
       return toJSON(HttpFSFileSystem.MKDIRS_JSON, mkdirs);
     }
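On the server side, both FSCreate and FSMkdirs now wrap the masked permission in FsCreateModes whenever an unmasked value was passed, so the NameNode can apply the unmasked mode when default ACLs are inherited. A minimal sketch of that combination (octal values are illustrative, not from the patch):

    import org.apache.hadoop.fs.permission.FsCreateModes;
    import org.apache.hadoop.fs.permission.FsPermission;

    public class CreateModesSketch {
      public static void main(String[] args) {
        FsPermission masked = new FsPermission((short) 0700);
        FsPermission unmasked = new FsPermission((short) 0777);
        // FsCreateModes carries both modes in a single FsPermission subclass.
        FsPermission createModes = FsCreateModes.create(masked, unmasked);
        System.out.println(createModes);
      }
    }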

+ 27 - 3
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSParametersProvider.java

@@ -66,9 +66,11 @@ public class HttpFSParametersProvider extends ParametersProvider {
     PARAMS_DEF.put(Operation.CONCAT, new Class[]{SourcesParam.class});
     PARAMS_DEF.put(Operation.TRUNCATE, new Class[]{NewLengthParam.class});
     PARAMS_DEF.put(Operation.CREATE,
-      new Class[]{PermissionParam.class, OverwriteParam.class,
-                  ReplicationParam.class, BlockSizeParam.class, DataParam.class});
-    PARAMS_DEF.put(Operation.MKDIRS, new Class[]{PermissionParam.class});
+        new Class[]{PermissionParam.class, OverwriteParam.class,
+            ReplicationParam.class, BlockSizeParam.class, DataParam.class,
+            UnmaskedPermissionParam.class});
+    PARAMS_DEF.put(Operation.MKDIRS, new Class[]{PermissionParam.class,
+        UnmaskedPermissionParam.class});
     PARAMS_DEF.put(Operation.RENAME, new Class[]{DestinationParam.class});
     PARAMS_DEF.put(Operation.SETOWNER,
         new Class[]{OwnerParam.class, GroupParam.class});
@@ -384,6 +386,28 @@ public class HttpFSParametersProvider extends ParametersProvider {
 
   }
 
+  /**
+   * Class for unmaskedpermission parameter.
+   */
+  @InterfaceAudience.Private
+  public static class UnmaskedPermissionParam extends ShortParam {
+
+    /**
+     * Parameter name.
+     */
+    public static final String NAME =
+        HttpFSFileSystem.UNMASKED_PERMISSION_PARAM;
+
+
+    /**
+     * Constructor.
+     */
+    public UnmaskedPermissionParam() {
+      super(NAME, (short) -1, 8);
+    }
+
+  }
+
   /**
    * Class for AclPermission parameter.
    */

+ 13 - 5
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/server/HttpFSServer.java

@@ -46,6 +46,7 @@ import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.PolicyNameParam
 import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.RecursiveParam;
 import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.ReplicationParam;
 import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.SourcesParam;
+import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.UnmaskedPermissionParam;
 import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.SnapshotNameParam;
 import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.XAttrEncodingParam;
 import org.apache.hadoop.fs.http.server.HttpFSParametersProvider.XAttrNameParam;
@@ -578,6 +579,8 @@ public class HttpFSServer {
         } else {
           Short permission = params.get(PermissionParam.NAME,
                                          PermissionParam.class);
+          Short unmaskedPermission = params.get(UnmaskedPermissionParam.NAME,
+              UnmaskedPermissionParam.class);
           Boolean override = params.get(OverwriteParam.NAME,
                                         OverwriteParam.class);
           Short replication = params.get(ReplicationParam.NAME,
@@ -586,11 +589,13 @@ public class HttpFSServer {
                                       BlockSizeParam.class);
           FSOperations.FSCreate command =
             new FSOperations.FSCreate(is, path, permission, override,
-                                      replication, blockSize);
+                replication, blockSize, unmaskedPermission);
           fsExecute(user, command);
           AUDIT_LOG.info(
-            "[{}] permission [{}] override [{}] replication [{}] blockSize [{}]",
-            new Object[]{path, permission, override, replication, blockSize});
+              "[{}] permission [{}] override [{}] "+
+              "replication [{}] blockSize [{}] unmaskedpermission [{}]",
+              new Object[]{path, permission,  override, replication, blockSize,
+                  unmaskedPermission});
           response = Response.status(Response.Status.CREATED).build();
         }
         break;
@@ -646,10 +651,13 @@ public class HttpFSServer {
       case MKDIRS: {
         Short permission = params.get(PermissionParam.NAME,
                                        PermissionParam.class);
+        Short unmaskedPermission = params.get(UnmaskedPermissionParam.NAME,
+            UnmaskedPermissionParam.class);
         FSOperations.FSMkdirs command =
-          new FSOperations.FSMkdirs(path, permission);
+            new FSOperations.FSMkdirs(path, permission, unmaskedPermission);
         JSONObject json = fsExecute(user, command);
-        AUDIT_LOG.info("[{}] permission [{}]", path, permission);
+        AUDIT_LOG.info("[{}] permission [{}] unmaskedpermission [{}]",
+            path, permission, unmaskedPermission);
         response = Response.ok(json).type(MediaType.APPLICATION_JSON).build();
         break;
       }
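End to end, the new field is just one more query-string parameter on CREATE and MKDIRS. A client-side sketch against a hypothetical HttpFS endpoint (host, user name and path are placeholders; the query string mirrors the tests added further down):

    import java.net.HttpURLConnection;
    import java.net.URL;

    public class MkdirsUnmaskedExample {
      public static void main(String[] args) throws Exception {
        // 14000 is the usual HttpFS port; everything else here is a placeholder.
        URL url = new URL("http://httpfs-host:14000/webhdfs/v1/tmp/unmaskedDir"
            + "?user.name=alice&op=MKDIRS&permission=700&unmaskedpermission=777");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("PUT");
        conn.connect();
        System.out.println("MKDIRS -> HTTP " + conn.getResponseCode());
      }
    }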

+ 174 - 0
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/server/TestHttpFSServer.java

@@ -41,6 +41,7 @@ import java.net.URL;
 import java.text.MessageFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
@@ -48,6 +49,11 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.XAttrCodec;
+import org.apache.hadoop.fs.permission.AclEntry;
+import org.apache.hadoop.fs.permission.AclEntryScope;
+import org.apache.hadoop.fs.permission.AclEntryType;
+import org.apache.hadoop.fs.permission.AclStatus;
+import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.hdfs.web.WebHdfsConstants;
 import org.apache.hadoop.lib.server.Service;
 import org.apache.hadoop.lib.server.ServiceException;
@@ -406,6 +412,19 @@ public class TestHttpFSServer extends HFSTestCase {
    * @throws Exception
    */
   private void createWithHttp(String filename, String perms) throws Exception {
+    createWithHttp(filename, perms, null);
+  }
+
+  /**
+   * Talks to the http interface to create a file.
+   *
+   * @param filename The file to create
+   * @param perms The permission field, if any (may be null)
+   * @param unmaskedPerms The unmaskedPermission field, if any (may be null)
+   * @throws Exception
+   */
+  private void createWithHttp(String filename, String perms,
+      String unmaskedPerms) throws Exception {
     String user = HadoopUsersConfTestHelper.getHadoopUsers()[0];
     // Remove leading / from filename
     if (filename.charAt(0) == '/') {
@@ -421,6 +440,9 @@ public class TestHttpFSServer extends HFSTestCase {
               "/webhdfs/v1/{0}?user.name={1}&permission={2}&op=CREATE",
               filename, user, perms);
     }
+    if (unmaskedPerms != null) {
+      pathOps = pathOps+"&unmaskedpermission="+unmaskedPerms;
+    }
     URL url = new URL(TestJettyHelper.getJettyURL(), pathOps);
     HttpURLConnection conn = (HttpURLConnection) url.openConnection();
     conn.addRequestProperty("Content-Type", "application/octet-stream");
@@ -429,6 +451,41 @@ public class TestHttpFSServer extends HFSTestCase {
     Assert.assertEquals(HttpURLConnection.HTTP_CREATED, conn.getResponseCode());
   }
 
+  /**
+   * Talks to the http interface to create a directory.
+   *
+   * @param dirname The directory to create
+   * @param perms The permission field, if any (may be null)
+   * @param unmaskedPerms The unmaskedPermission field, if any (may be null)
+   * @throws Exception
+   */
+  private void createDirWithHttp(String dirname, String perms,
+      String unmaskedPerms) throws Exception {
+    String user = HadoopUsersConfTestHelper.getHadoopUsers()[0];
+    // Remove leading / from filename
+    if (dirname.charAt(0) == '/') {
+      dirname = dirname.substring(1);
+    }
+    String pathOps;
+    if (perms == null) {
+      pathOps = MessageFormat.format(
+              "/webhdfs/v1/{0}?user.name={1}&op=MKDIRS",
+              dirname, user);
+    } else {
+      pathOps = MessageFormat.format(
+              "/webhdfs/v1/{0}?user.name={1}&permission={2}&op=MKDIRS",
+              dirname, user, perms);
+    }
+    if (unmaskedPerms != null) {
+      pathOps = pathOps+"&unmaskedpermission="+unmaskedPerms;
+    }
+    URL url = new URL(TestJettyHelper.getJettyURL(), pathOps);
+    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+    conn.setRequestMethod("PUT");
+    conn.connect();
+    Assert.assertEquals(HttpURLConnection.HTTP_OK, conn.getResponseCode());
+  }
+
   /**
    * Talks to the http interface to get the json output of a *STATUS command
    * on the given file.
@@ -577,6 +634,27 @@ public class TestHttpFSServer extends HFSTestCase {
     }
   }
 
+  /**
+   *
+   * @param stat AclStatus object from a call to getAclStatus
+   * @param name The name of the ACL being searched for
+   * @return The AclEntry if found, or null otherwise
+   * @throws IOException
+   */
+  private AclEntry findAclWithName(AclStatus stat, String name)
+      throws IOException{
+    AclEntry relevantAcl = null;
+    Iterator<AclEntry> it = stat.getEntries().iterator();
+    while (it.hasNext()) {
+      AclEntry e = it.next();
+      if (e.getName().equals(name)) {
+        relevantAcl = e;
+        break;
+      }
+    }
+    return relevantAcl;
+  }
+
   /**
    * Validate that files are created with 755 permissions when no
    * 'permissions' attribute is specified, and when 'permissions'
@@ -837,6 +915,102 @@ public class TestHttpFSServer extends HFSTestCase {
     Assert.assertEquals(-1, is.read());
   }
 
+  @Test
+  @TestDir
+  @TestJetty
+  @TestHdfs
+  public void testCreateFileWithUnmaskedPermissions() throws Exception {
+    createHttpFSServer(false, false);
+
+    FileSystem fs = FileSystem.get(TestHdfsHelper.getHdfsConf());
+    // Create a folder with a default acl default:user2:rw-
+    fs.mkdirs(new Path("/tmp"));
+    AclEntry acl = new org.apache.hadoop.fs.permission.AclEntry.Builder()
+        .setType(AclEntryType.USER)
+        .setScope(AclEntryScope.DEFAULT)
+        .setName("user2")
+        .setPermission(FsAction.READ_WRITE)
+        .build();
+    fs.setAcl(new Path("/tmp"), new ArrayList<AclEntry>(Arrays.asList(acl)));
+
+    String notUnmaskedFile = "/tmp/notUnmasked";
+    String unmaskedFile = "/tmp/unmasked";
+
+    // Create a file inside the folder. It should inherit the default acl
+    // but the mask should affect the ACL permissions. The mask is controlled
+    // by the group permissions, which are 0, and hence the mask will make
+    // the effective permission of the inherited ACL be NONE.
+    createWithHttp(notUnmaskedFile, "700");
+
+    // Pull the relevant ACL from the FS object and check the mask has affected
+    // its permissions.
+    AclStatus aclStatus = fs.getAclStatus(new Path(notUnmaskedFile));
+    AclEntry theAcl = findAclWithName(aclStatus, "user2");
+
+    Assert.assertNotNull(theAcl);
+    Assert.assertEquals(FsAction.NONE,
+        aclStatus.getEffectivePermission(theAcl));
+
+    // Create another file, this time pass a mask of 777. Now the inherited
+    // permissions should be as expected
+    createWithHttp(unmaskedFile, "700", "777");
+
+    aclStatus = fs.getAclStatus(new Path(unmaskedFile));
+    theAcl = findAclWithName(aclStatus, "user2");
+
+    Assert.assertNotNull(theAcl);
+    Assert.assertEquals(FsAction.READ_WRITE,
+        aclStatus.getEffectivePermission(theAcl));
+  }
+
+  @Test
+  @TestDir
+  @TestJetty
+  @TestHdfs
+  public void testMkdirWithUnmaskedPermissions() throws Exception {
+    createHttpFSServer(false, false);
+
+    FileSystem fs = FileSystem.get(TestHdfsHelper.getHdfsConf());
+    // Create a folder with a default acl default:user2:rw-
+    fs.mkdirs(new Path("/tmp"));
+    AclEntry acl = new org.apache.hadoop.fs.permission.AclEntry.Builder()
+        .setType(AclEntryType.USER)
+        .setScope(AclEntryScope.DEFAULT)
+        .setName("user2")
+        .setPermission(FsAction.READ_WRITE)
+        .build();
+    fs.setAcl(new Path("/tmp"), new ArrayList<AclEntry>(Arrays.asList(acl)));
+
+    String notUnmaskedDir = "/tmp/notUnmaskedDir";
+    String unmaskedDir = "/tmp/unmaskedDir";
+
+    // Create a file inside the folder. It should inherit the default acl
+    // but the mask should affect the ACL permissions. The mask is controlled
+    // by the group permissions, which are 0, and hence the mask will make
+    // the effective permission of the inherited ACL be NONE.
+    createDirWithHttp(notUnmaskedDir, "700", null);
+
+    // Pull the relevant ACL from the FS object and check the mask has affected
+    // its permissions.
+    AclStatus aclStatus = fs.getAclStatus(new Path(notUnmaskedDir));
+    AclEntry theAcl = findAclWithName(aclStatus, "user2");
+
+    Assert.assertNotNull(theAcl);
+    Assert.assertEquals(FsAction.NONE,
+        aclStatus.getEffectivePermission(theAcl));
+
+    // Create another file, this time pass a mask of 777. Now the inherited
+    // permissions should be as expected
+    createDirWithHttp(unmaskedDir, "700", "777");
+
+    aclStatus = fs.getAclStatus(new Path(unmaskedDir));
+    theAcl = findAclWithName(aclStatus, "user2");
+
+    Assert.assertNotNull(theAcl);
+    Assert.assertEquals(FsAction.READ_WRITE,
+        aclStatus.getEffectivePermission(theAcl));
+  }
+
   @Test
   @TestDir
   @TestJetty

+ 8 - 2
hadoop-hdfs-project/hadoop-hdfs-httpfs/src/test/java/org/apache/hadoop/fs/http/server/TestHttpFSServerWebServer.java

@@ -22,9 +22,12 @@ import java.io.File;
 import java.io.InputStreamReader;
 import java.net.HttpURLConnection;
 import java.net.URL;
+import java.nio.charset.Charset;
 import java.text.MessageFormat;
 
+import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.security.authentication.server.AuthenticationFilter;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.test.HadoopUsersConfTestHelper;
 import org.junit.Assert;
@@ -57,7 +60,8 @@ public class TestHttpFSServerWebServer {
     System.setProperty("httpfs.home.dir", homeDir.getAbsolutePath());
     System.setProperty("httpfs.log.dir", logsDir.getAbsolutePath());
     System.setProperty("httpfs.config.dir", confDir.getAbsolutePath());
-    new File(confDir, "httpfs-signature.secret").createNewFile();
+    FileUtils.writeStringToFile(new File(confDir, "httpfs-signature.secret"),
+        "foo", Charset.forName("UTF-8"));
   }
 
   @Before
@@ -65,6 +69,8 @@ public class TestHttpFSServerWebServer {
     Configuration conf = new Configuration();
     conf.set(HttpFSServerWebServer.HTTP_HOSTNAME_KEY, "localhost");
     conf.setInt(HttpFSServerWebServer.HTTP_PORT_KEY, 0);
+    conf.set(AuthenticationFilter.SIGNATURE_SECRET_FILE,
+        "httpfs-signature.secret");
     Configuration sslConf = new Configuration();
     webServer = new HttpFSServerWebServer(conf, sslConf);
   }
@@ -76,7 +82,7 @@ public class TestHttpFSServerWebServer {
     URL url = new URL(webServer.getUrl(), MessageFormat.format(
         "/webhdfs/v1/?user.name={0}&op=liststatus", user));
     HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-    Assert.assertEquals(conn.getResponseCode(), HttpURLConnection.HTTP_OK);
+    Assert.assertEquals(HttpURLConnection.HTTP_OK, conn.getResponseCode());
     BufferedReader reader = new BufferedReader(
         new InputStreamReader(conn.getInputStream()));
     reader.readLine();

+ 9 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -419,6 +419,15 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
       "dfs.namenode.snapshot.max.limit";
 
   public static final int DFS_NAMENODE_SNAPSHOT_MAX_LIMIT_DEFAULT = 65536;
+  public static final String DFS_NAMENODE_SNAPSHOT_SKIPLIST_SKIP_INTERVAL =
+      "dfs.namenode.snapshot.skiplist.interval";
+  public static final int DFS_NAMENODE_SNAPSHOT_SKIPLIST_SKIP_INTERVAL_DEFAULT =
+      10;
+  public static final String DFS_NAMENODE_SNAPSHOT_SKIPLIST_MAX_LEVELS =
+      "dfs.namenode.snapshot.skiplist.max.levels";
+  public static final int
+      DFS_NAMENODE_SNAPSHOT_SKIPLIST_MAX_SKIP_LEVELS_DEFAULT = 0;
+
   // Whether to enable datanode's stale state detection and usage for reads
   public static final String DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_KEY = "dfs.namenode.avoid.read.stale.datanode";
   public static final boolean DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_READ_DEFAULT = false;

+ 2 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolServerSideTranslatorPB.java

@@ -86,7 +86,8 @@ public class NamenodeProtocolServerSideTranslatorPB implements
         .build();
     BlocksWithLocations blocks;
     try {
-      blocks = impl.getBlocks(dnInfo, request.getSize());
+      blocks = impl.getBlocks(dnInfo, request.getSize(),
+          request.getMinBlockSize());
     } catch (IOException e) {
       throw new ServiceException(e);
     }

+ 3 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java

@@ -99,11 +99,12 @@ public class NamenodeProtocolTranslatorPB implements NamenodeProtocol,
   }
 
   @Override
-  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size)
+  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size, long
+      minBlockSize)
       throws IOException {
     GetBlocksRequestProto req = GetBlocksRequestProto.newBuilder()
         .setDatanode(PBHelperClient.convert((DatanodeID)datanode)).setSize(size)
-        .build();
+        .setMinBlockSize(minBlockSize).build();
     try {
       return PBHelper.convert(rpcProxy.getBlocks(NULL_CONTROLLER, req)
           .getBlocks());

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Dispatcher.java

@@ -785,7 +785,7 @@ public class Dispatcher {
     private long getBlockList() throws IOException {
       final long size = Math.min(getBlocksSize, blocksToReceive);
       final BlocksWithLocations newBlksLocs =
-          nnc.getBlocks(getDatanodeInfo(), size);
+          nnc.getBlocks(getDatanodeInfo(), size, getBlocksMinBlockSize);
 
       if (LOG.isTraceEnabled()) {
         LOG.trace("getBlocks(" + getDatanodeInfo() + ", "

+ 3 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java

@@ -162,9 +162,10 @@ public class NameNodeConnector implements Closeable {
   }
 
   /** @return blocks with locations. */
-  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size)
+  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size, long
+      minBlockSize)
       throws IOException {
-    return namenode.getBlocks(datanode, size);
+    return namenode.getBlocks(datanode, size, minBlockSize);
   }
 
   /**

+ 4 - 13
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -408,13 +408,6 @@ public class BlockManager implements BlockStatsMXBean {
    */
   private int numBlocksPerIteration;
 
-  /**
-   * Minimum size that a block can be sent to Balancer through getBlocks.
-   * And after HDFS-8824, the small blocks are unused anyway, so there's no
-   * point to send them to balancer.
-   */
-  private long getBlocksMinBlockSize = -1;
-
   /**
    * Progress of the Reconstruction queues initialisation.
    */
@@ -539,9 +532,6 @@ public class BlockManager implements BlockStatsMXBean {
     this.numBlocksPerIteration = conf.getInt(
         DFSConfigKeys.DFS_BLOCK_MISREPLICATION_PROCESSING_LIMIT,
         DFSConfigKeys.DFS_BLOCK_MISREPLICATION_PROCESSING_LIMIT_DEFAULT);
-    this.getBlocksMinBlockSize = conf.getLongBytes(
-        DFSConfigKeys.DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_KEY,
-        DFSConfigKeys.DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_DEFAULT);
 
     final int minMaintenanceR = conf.getInt(
         DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY,
@@ -1469,7 +1459,8 @@ public class BlockManager implements BlockStatsMXBean {
 
   /** Get all blocks with location information from a datanode. */
   public BlocksWithLocations getBlocksWithLocations(final DatanodeID datanode,
-      final long size) throws UnregisteredNodeException {
+      final long size, final long minBlockSize) throws
+      UnregisteredNodeException {
     final DatanodeDescriptor node = getDatanodeManager().getDatanode(datanode);
     if (node == null) {
       blockLog.warn("BLOCK* getBlocks: Asking for blocks from an" +
@@ -1491,7 +1482,7 @@ public class BlockManager implements BlockStatsMXBean {
     while(totalSize<size && iter.hasNext()) {
       curBlock = iter.next();
       if(!curBlock.isComplete())  continue;
-      if (curBlock.getNumBytes() < getBlocksMinBlockSize) {
+      if (curBlock.getNumBytes() < minBlockSize) {
         continue;
       }
       totalSize += addBlock(curBlock, results);
@@ -1501,7 +1492,7 @@ public class BlockManager implements BlockStatsMXBean {
       for(int i=0; i<startBlock&&totalSize<size; i++) {
         curBlock = iter.next();
         if(!curBlock.isComplete())  continue;
-        if (curBlock.getNumBytes() < getBlocksMinBlockSize) {
+        if (curBlock.getNumBytes() < minBlockSize) {
           continue;
         }
         totalSize += addBlock(curBlock, results);
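Taken together with the protocol, Dispatcher and NameNodeConnector hunks above, the minimum-block-size filter moves from a NameNode-side configuration read into an argument of getBlocks, so the caller decides the threshold. A caller-side sketch (class and method names are illustrative):

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
    import org.apache.hadoop.hdfs.server.balancer.NameNodeConnector;
    import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;

    final class GetBlocksSketch {
      // The balancer reads the minimum block size from its own configuration
      // and sends it over the wire, instead of the NameNode filtering by a
      // key read from the NameNode's configuration.
      static BlocksWithLocations fetch(NameNodeConnector nnc, DatanodeInfo dn,
          long bytesToFetch, Configuration conf) throws IOException {
        long minBlockSize = conf.getLongBytes(
            DFSConfigKeys.DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_KEY,
            DFSConfigKeys.DFS_BALANCER_GETBLOCKS_MIN_BLOCK_SIZE_DEFAULT);
        return nnc.getBlocks(dn, bytesToFetch, minBlockSize);
      }
    }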

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java

@@ -3579,7 +3579,7 @@ public class DataNode extends ReconfigurableBase
               + " Disk balancing not permitted.",
           DiskBalancerException.Result.DATANODE_STATUS_NOT_REGULAR);
     }
-    // TODO : Support force option
+
     this.diskBalancer.submitPlan(planID, planVersion, planFile, planData,
             skipDateCheck);
   }

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DiskBalancer.java

@@ -958,8 +958,8 @@ public class DiskBalancer {
       ExtendedBlock block = null;
       while (block == null && currentCount < poolIters.size()) {
         currentCount++;
-        poolIndex = poolIndex++ % poolIters.size();
-        FsVolumeSpi.BlockIterator currentPoolIter = poolIters.get(poolIndex);
+        int index = poolIndex++ % poolIters.size();
+        FsVolumeSpi.BlockIterator currentPoolIter = poolIters.get(index);
         block = getBlockToCopy(currentPoolIter, item);
       }
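The bug fixed here is the classic post-increment self-assignment: in "poolIndex = poolIndex++ % poolIters.size()" the old value of poolIndex++ is written back by the assignment, so the increment is lost and the loop kept probing the same block pool. A tiny standalone demonstration of the difference:

    public class PostIncrementPitfall {
      public static void main(String[] args) {
        int n = 3;

        int i = 0;
        i = i++ % n;           // i++ yields 0, then the assignment overwrites
        System.out.println(i); // the increment: prints 0

        int j = 0;
        int index = j++ % n;   // the fixed pattern: j advances to 1, index is 0
        System.out.println(j + " " + index); // prints "1 0"
      }
    }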
 

+ 2 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/web/webhdfs/WebHdfsHandler.java

@@ -58,6 +58,7 @@ import java.io.OutputStream;
 import java.net.InetSocketAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.net.URLDecoder;
 import java.nio.charset.StandardCharsets;
 import java.security.PrivilegedExceptionAction;
 import java.util.EnumSet;
@@ -127,7 +128,7 @@ public class WebHdfsHandler extends SimpleChannelInboundHandler<HttpRequest> {
     params = new ParameterParser(queryString, conf);
     DataNodeUGIProvider ugiProvider = new DataNodeUGIProvider(params);
     ugi = ugiProvider.ugi();
-    path = params.path();
+    path = URLDecoder.decode(params.path(), "UTF-8");
 
     injectToken();
     ugi.doAs(new PrivilegedExceptionAction<Void>() {

+ 16 - 4
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/diskbalancer/command/ExecuteCommand.java

@@ -48,6 +48,8 @@ public class ExecuteCommand extends Command {
     super(conf);
     addValidCommandParameters(DiskBalancerCLI.EXECUTE,
         "Executes a given plan.");
+    addValidCommandParameters(DiskBalancerCLI.SKIPDATECHECK,
+        "skips the date check and force execute the plan");
   }
 
   /**
@@ -69,7 +71,16 @@ public class ExecuteCommand extends Command {
     try (FSDataInputStream plan = open(planFile)) {
       planData = IOUtils.toString(plan);
     }
-    submitPlan(planFile, planData);
+
+    boolean skipDateCheck = false;
+    if(cmd.hasOption(DiskBalancerCLI.SKIPDATECHECK)) {
+      skipDateCheck = true;
+      LOG.warn("Skipping date check on this plan. This could mean we are " +
+          "executing an old plan and may not be the right plan for this " +
+          "data node.");
+    }
+
+    submitPlan(planFile, planData, skipDateCheck);
   }
 
   /**
@@ -77,9 +88,11 @@ public class ExecuteCommand extends Command {
    *
    * @param planFile - Plan file name
    * @param planData - Plan data in json format
+   * @param skipDateCheck - skips date check
    * @throws IOException
    */
-  private void submitPlan(final String planFile, final String planData)
+  private void submitPlan(final String planFile, final String planData,
+                          boolean skipDateCheck)
           throws IOException {
     Preconditions.checkNotNull(planData);
     NodePlan plan = NodePlan.parseJson(planData);
@@ -88,9 +101,8 @@ public class ExecuteCommand extends Command {
     ClientDatanodeProtocol dataNode = getDataNodeProxy(dataNodeAddress);
     String planHash = DigestUtils.shaHex(planData);
     try {
-      // TODO : Support skipping date check.
       dataNode.submitDiskBalancerPlan(planHash, DiskBalancerCLI.PLAN_VERSION,
-                                      planFile, planData, false);
+                                      planFile, planData, skipDateCheck);
     } catch (DiskBalancerException ex) {
       LOG.error("Submitting plan on  {} failed. Result: {}, Message: {}",
           plan.getNodeName(), ex.getResult().toString(), ex.getMessage());

+ 13 - 5
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/federation/router/Router.java

@@ -101,7 +101,7 @@ public class Router extends CompositeService {
   /** Interface to identify the active NN for a nameservice or blockpool ID. */
   private ActiveNamenodeResolver namenodeResolver;
   /** Updates the namenode status in the namenode resolver. */
-  private Collection<NamenodeHeartbeatService> namenodeHearbeatServices;
+  private Collection<NamenodeHeartbeatService> namenodeHeartbeatServices;
 
   /** Router metrics. */
   private RouterMetricsService metrics;
@@ -196,13 +196,13 @@ public class Router extends CompositeService {
         DFSConfigKeys.DFS_ROUTER_HEARTBEAT_ENABLE_DEFAULT)) {
 
       // Create status updater for each monitored Namenode
-      this.namenodeHearbeatServices = createNamenodeHearbeatServices();
+      this.namenodeHeartbeatServices = createNamenodeHeartbeatServices();
       for (NamenodeHeartbeatService hearbeatService :
-          this.namenodeHearbeatServices) {
+          this.namenodeHeartbeatServices) {
         addService(hearbeatService);
       }
 
-      if (this.namenodeHearbeatServices.isEmpty()) {
+      if (this.namenodeHeartbeatServices.isEmpty()) {
         LOG.error("Heartbeat is enabled but there are no namenodes to monitor");
       }
 
@@ -411,7 +411,7 @@ public class Router extends CompositeService {
    * @return List of heartbeat services.
    */
   protected Collection<NamenodeHeartbeatService>
-      createNamenodeHearbeatServices() {
+      createNamenodeHeartbeatServices() {
 
     Map<String, NamenodeHeartbeatService> ret = new HashMap<>();
 
@@ -645,4 +645,12 @@ public class Router extends CompositeService {
   RouterQuotaUpdateService getQuotaCacheUpdateService() {
     return this.quotaUpdateService;
   }
+
+  /**
+   * Get the list of namenode heartbeat service.
+   */
+  @VisibleForTesting
+  Collection<NamenodeHeartbeatService> getNamenodeHearbeatServices() {
+    return this.namenodeHeartbeatServices;
+  }
 }

+ 4 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -1718,13 +1718,14 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
    * @param datanode on which blocks are located
    * @param size total size of blocks
    */
-  public BlocksWithLocations getBlocks(DatanodeID datanode, long size)
-      throws IOException {
+  public BlocksWithLocations getBlocks(DatanodeID datanode, long size, long
+      minimumBlockSize) throws IOException {
     checkOperation(OperationCategory.READ);
     readLock();
     try {
       checkOperation(OperationCategory.READ);
-      return getBlockManager().getBlocksWithLocations(datanode, size);
+      return getBlockManager().getBlocksWithLocations(datanode, size,
+          minimumBlockSize);
     } finally {
       readUnlock("getBlocks");
     }

+ 30 - 5
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java

@@ -34,6 +34,7 @@ import static org.apache.hadoop.util.Time.now;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.InetSocketAddress;
+import java.security.PrivilegedExceptionAction;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.EnumSet;
@@ -189,6 +190,7 @@ import org.apache.hadoop.ipc.RefreshResponse;
 import org.apache.hadoop.net.Node;
 import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.Groups;
+import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.authorize.AuthorizationException;
 import org.apache.hadoop.security.authorize.ProxyUsers;
@@ -616,15 +618,20 @@ public class NameNodeRpcServer implements NamenodeProtocols {
   // NamenodeProtocol
   /////////////////////////////////////////////////////
   @Override // NamenodeProtocol
-  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size)
-  throws IOException {
+  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size, long
+      minBlockSize)
+      throws IOException {
     if(size <= 0) {
       throw new IllegalArgumentException(
-        "Unexpected not positive size: "+size);
+          "Unexpected not positive size: "+size);
+    }
+    if(minBlockSize < 0) {
+      throw new IllegalArgumentException(
+          "Unexpected not positive size: "+size);
     }
     checkNNStartup();
     namesystem.checkSuperuserPrivilege();
-    return namesystem.getBlocks(datanode, size);
+    return namesystem.getBlocks(datanode, size, minBlockSize);
   }
 
   @Override // NamenodeProtocol
@@ -2253,6 +2260,24 @@ public class NameNodeRpcServer implements NamenodeProtocols {
     // guaranteed to have been written by this NameNode.)
     boolean readInProgress = syncTxid > 0;
 
+    // doas the NN login user for the actual operations to get edits.
+    // Notably this is necessary when polling from the remote edits via https.
+    // We have validated the client is a superuser from the NN RPC, so this
+    // running as the login user here is safe.
+    EventBatchList ret = SecurityUtil.doAsLoginUser(
+        new PrivilegedExceptionAction<EventBatchList>() {
+          @Override
+          public EventBatchList run() throws IOException {
+            return getEventBatchList(syncTxid, txid, log, readInProgress,
+                maxEventsPerRPC);
+          }
+        });
+    return ret;
+  }
+
+  private EventBatchList getEventBatchList(long syncTxid, long txid,
+      FSEditLog log, boolean readInProgress, int maxEventsPerRPC)
+      throws IOException {
     List<EventBatch> batches = Lists.newArrayList();
     int totalEvents = 0;
     long maxSeenTxid = -1;
@@ -2271,7 +2296,7 @@ public class NameNodeRpcServer implements NamenodeProtocols {
       // and are using QJM -- the edit log will be closed and this exception
       // will result
       LOG.info("NN is transitioning from active to standby and FSEditLog " +
-      "is closed -- could not read edits");
+          "is closed -- could not read edits");
       return new EventBatchList(batches, firstSeenTxid, maxSeenTxid, syncTxid);
     }
 

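The same doAs-login-user pattern, written as a compact sketch with a Java 8 lambda equivalent to the anonymous PrivilegedExceptionAction above; the variables are the ones in the hunk.

    // Caller privileges were already checked on the RPC, so reading edits
    // (possibly over HTTPS from the JournalNodes) runs as the NN login user.
    return SecurityUtil.doAsLoginUser(
        (PrivilegedExceptionAction<EventBatchList>) () ->
            getEventBatchList(syncTxid, txid, log, readInProgress,
                maxEventsPerRPC));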
+ 16 - 10
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/AbstractINodeDiffList.java

@@ -138,10 +138,14 @@ abstract class AbstractINodeDiffList<N extends INode,
     return n == 0 ? null : diffs.get(n - 1);
   }
 
+  DiffList<D> newDiffs() {
+    return new DiffListByArrayList<>(
+        INodeDirectory.DEFAULT_FILES_PER_DIRECTORY);
+  }
+
   private void createDiffsIfNeeded() {
     if (diffs == null) {
-      diffs =
-          new DiffListByArrayList<>(INodeDirectory.DEFAULT_FILES_PER_DIRECTORY);
+      diffs = newDiffs();
     }
   }
 
@@ -231,6 +235,12 @@ abstract class AbstractINodeDiffList<N extends INode,
     return diff == null ? Snapshot.CURRENT_STATE_ID : diff.getSnapshotId();
   }
 
+  public final int getDiffIndexById(final int snapshotId) {
+    int diffIndex = diffs.binarySearch(snapshotId);
+    diffIndex = diffIndex < 0 ? (-diffIndex - 1) : diffIndex;
+    return diffIndex;
+  }
+
   final int[] changedBetweenSnapshots(Snapshot from, Snapshot to) {
     if (diffs == null) {
       return null;
@@ -243,10 +253,10 @@ abstract class AbstractINodeDiffList<N extends INode,
     }
 
     final int size = diffs.size();
-    int earlierDiffIndex = diffs.binarySearch(earlier.getId());
+    int earlierDiffIndex = getDiffIndexById(earlier.getId());
     int laterDiffIndex = later == null ? size
-        : diffs.binarySearch(later.getId());
-    if (-earlierDiffIndex - 1 == size) {
+        : getDiffIndexById(later.getId());
+    if (earlierDiffIndex == size) {
       // if the earlierSnapshot is after the latest SnapshotDiff stored in
       // diffs, no modification happened after the earlierSnapshot
       return null;
@@ -256,10 +266,6 @@ abstract class AbstractINodeDiffList<N extends INode,
       // before it, no modification happened before the laterSnapshot
       return null;
     }
-    earlierDiffIndex = earlierDiffIndex < 0 ? (-earlierDiffIndex - 1)
-        : earlierDiffIndex;
-    laterDiffIndex = laterDiffIndex < 0 ? (-laterDiffIndex - 1)
-        : laterDiffIndex;
     return new int[]{earlierDiffIndex, laterDiffIndex};
   }
 
@@ -300,7 +306,7 @@ abstract class AbstractINodeDiffList<N extends INode,
     }
     return diff;
   }
-  
+
   @Override
   public Iterator<D> iterator() {
     return diffs != null ? diffs.iterator() : Collections.emptyIterator();

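getDiffIndexById normalizes the result of the binary search the same way the Collections.binarySearch contract works: a missing key is reported as -(insertionPoint) - 1. A small standalone sketch of that normalization; the sample snapshot IDs are illustrative.

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class BinarySearchIndexDemo {
      // Mirrors getDiffIndexById: map a (possibly negative) binarySearch result
      // to the index of the first element >= the key.
      static int toIndex(int binarySearchResult) {
        return binarySearchResult < 0 ? -binarySearchResult - 1 : binarySearchResult;
      }

      public static void main(String[] args) {
        List<Integer> snapshotIds = Arrays.asList(3, 7, 12, 20);
        // 7 is present -> index 1; 10 is absent -> insertion point 2.
        System.out.println(toIndex(Collections.binarySearch(snapshotIds, 7)));   // 1
        System.out.println(toIndex(Collections.binarySearch(snapshotIds, 10)));  // 2
      }
    }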
+ 131 - 91
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryDiffList.java → hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DiffListBySkipList.java

@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs.server.namenode.snapshot;
 
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
 import org.apache.hadoop.hdfs.server.namenode.snapshot.
     DirectoryWithSnapshotFeature.DirectoryDiff;
@@ -30,9 +31,7 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.Random;
 import java.util.Objects;
-import java.util.concurrent.ThreadLocalRandom;
 
 /**
  * SkipList is an implementation of a data structure for storing a sorted list
@@ -68,11 +67,24 @@ import java.util.concurrent.ThreadLocalRandom;
  * <p>
  * Once a snapshot gets deleted, the list needs to be balanced.
  */
-public class DirectoryDiffList implements DiffList<DirectoryDiff> {
+public class DiffListBySkipList implements DiffList<DirectoryDiff> {
   public static final Logger LOG =
-      LoggerFactory.getLogger(DirectoryDiffList.class);
+      LoggerFactory.getLogger(DiffListBySkipList.class);
+
+  static String childrenDiff2String(ChildrenDiff diff) {
+    if (diff == null) {
+      return "null";
+    }
+    return "@" + Integer.toHexString(System.identityHashCode(diff));
+  }
+
+  static String skip2String(SkipListNode skipTo, ChildrenDiff diff) {
+    return "->" + skipTo + ":diff=" + childrenDiff2String(diff);
+  }
 
   private static class SkipDiff {
+    static final SkipDiff[] EMPTY_ARRAY = {};
+
     /**
      * The references to the subsequent nodes.
      */
@@ -104,7 +116,7 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
 
     @Override
     public String toString() {
-      return "->" + skipTo + (diff == null? " (diff==null)": "");
+      return skip2String(skipTo, diff);
     }
   }
 
@@ -112,17 +124,19 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
    * SkipListNode is an implementation of a DirectoryDiff List node,
    * which stores a Directory Diff and references to subsequent nodes.
    */
-  private final static class SkipListNode implements Comparable<Integer> {
+  final static class SkipListNode implements Comparable<Integer> {
 
     /**
      * The data element stored in this node.
      */
-    private DirectoryDiff diff;
+    private final DirectoryDiff diff;
 
+    /** Next node. */
+    private SkipListNode next;
     /**
-     * List containing combined children diffs over a skip interval.
+     * Array containing combined children diffs over a skip interval.
      */
-    private List<SkipDiff> skipDiffList;
+    private SkipDiff[] skips;
 
     /**
      * Constructs a new instance of SkipListNode with the specified data element
@@ -132,20 +146,28 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
      */
     SkipListNode(DirectoryDiff diff, int level) {
       this.diff = diff;
-      skipDiffList = new ArrayList<>(level + 1);
+
+      this.skips = level > 0? new SkipDiff[level]: SkipDiff.EMPTY_ARRAY;
+      for(int i = 0; i < skips.length; i++) {
+        skips[i] = new SkipDiff(null);
+      }
     }
 
     /**
      * Returns the level of this SkipListNode.
      */
     public int level() {
-      return skipDiffList.size() - 1;
+      return skips.length;
     }
 
     void trim() {
-      for (int level = level();
-           level > 0 && getSkipNode(level) == null; level--) {
-        skipDiffList.remove(level);
+      int n = skips.length - 1;
+      for (; n >= 0 && skips[n] == null; n--) {
+        continue;
+      }
+      n++;
+      if (n < skips.length) {
+        skips = n > 0 ? Arrays.copyOf(skips, n) : SkipDiff.EMPTY_ARRAY;
       }
     }
 
@@ -179,40 +201,66 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
     }
 
     public void setSkipDiff(ChildrenDiff cDiff, int level) {
-      if (level < skipDiffList.size()) {
-        skipDiffList.get(level).setDiff(cDiff);
-      } else {
-        skipDiffList.add(new SkipDiff(cDiff));
+      Preconditions.checkArgument(level > 0);
+      resize(level);
+      skips[level - 1].setDiff(cDiff);
+    }
+
+    void setSkipDiff4Target(
+        SkipListNode target, int startLevel, ChildrenDiff childrenDiff) {
+      for(int i = startLevel; i <= level(); i++) {
+        if (getSkipNode(i) != target) {
+          return;
+        }
+        setSkipDiff(childrenDiff, i);
+      }
+    }
+
+    private void resize(int newLevel) {
+      int i = skips.length;
+      if (i < newLevel) {
+        skips = Arrays.copyOf(skips, newLevel);
+        for (; i < newLevel; i++) {
+          skips[i] = new SkipDiff(null);
+        }
       }
     }
 
     public void setSkipTo(SkipListNode node, int level) {
-      for (int i = skipDiffList.size(); i <= level; i++) {
-        skipDiffList.add(new SkipDiff(null));
+      if (level == 0) {
+        next = node;
+      } else {
+        resize(level);
+        skips[level - 1].setSkipTo(node);
       }
-      skipDiffList.get(level).setSkipTo(node);
     }
 
     public ChildrenDiff getChildrenDiff(int level) {
       if (level == 0) {
-        return diff.getChildrenDiff();
+        return diff != null? diff.getChildrenDiff(): null;
       } else {
-        return skipDiffList.get(level).getDiff();
+        return skips[level - 1].getDiff();
       }
     }
 
     SkipListNode getSkipNode(int level) {
-      if (level >= skipDiffList.size()) {
-        return null;
-      } else {
-        return skipDiffList.get(level).getSkipTo();
-      }
+      return level == 0? next
+          : level <= skips.length? skips[level - 1].getSkipTo()
+          : null;
     }
 
     @Override
     public String toString() {
       return diff != null ? "" + diff.getSnapshotId() : "?";
     }
+
+    StringBuilder appendTo(StringBuilder b) {
+      b.append(this).append(": ").append(skip2String(next, getChildrenDiff(0)));
+      for(int i = 0; i < skips.length; i++) {
+        b.append(", ").append(skips[i]);
+      }
+      return b;
+    }
   }
 
   /**
@@ -220,17 +268,7 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
    * The list will grow linearly once a new Directory diff gets added.
    * All the list inteface defined methods provide a linear view of the list.
    */
-  private List<SkipListNode> skipNodeList;
-
-  /**
-   * The max no of skipLevels.
-   */
-  private final int maxSkipLevels;
-
-  /**
-   * The no of diffs after which the level promotion happens.
-   */
-  private final int skipInterval;
+  private final List<SkipListNode> skipNodeList;
 
   /**
    * The head node to the list.
@@ -240,11 +278,9 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
   /**
    * Constructs a new, empty instance of SkipList.
    */
-  public DirectoryDiffList(int capacity, int interval, int skipLevel) {
+  public DiffListBySkipList(int capacity) {
     skipNodeList = new ArrayList<>(capacity);
     head = new SkipListNode(null, 0);
-    this.maxSkipLevels = skipLevel;
-    this.skipInterval = interval;
   }
 
   /**
@@ -254,13 +290,9 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
    */
   @Override
   public void addFirst(DirectoryDiff diff) {
-    final int nodeLevel = randomLevel(skipInterval, maxSkipLevels);
+    final int nodeLevel = DirectoryDiffListFactory.randomLevel();
     final SkipListNode[] nodePath = new SkipListNode[nodeLevel + 1];
-
     Arrays.fill(nodePath, head);
-    for (int level = head.level() + 1; level <= nodeLevel; level++) {
-      head.skipDiffList.add(new SkipDiff(null));
-    }
 
     final SkipListNode newNode = new SkipListNode(diff, nodeLevel);
     for (int level = 0; level <= nodeLevel; level++) {
@@ -316,15 +348,10 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
    */
   @Override
   public boolean addLast(DirectoryDiff diff) {
-    final int nodeLevel = randomLevel(skipInterval, maxSkipLevels);
-    final int headLevel = head.level();
+    final int nodeLevel = DirectoryDiffListFactory.randomLevel();
     final SkipListNode[] nodePath = findPreviousNodes(null, nodeLevel);
-    for (int level = headLevel + 1; level <= nodeLevel; level++) {
-      head.skipDiffList.add(new SkipDiff(null));
-      nodePath[level] = head;
-    }
 
-    final SkipListNode current = new SkipListNode(diff, nodeLevel);
+    final SkipListNode newNode = new SkipListNode(diff, nodeLevel);
     for (int level = 0; level <= nodeLevel; level++) {
       if (level > 0 && nodePath[level] != head) {
         //  suppose the list is like:
@@ -339,22 +366,24 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
         //  level 0:head->    s1->s2->s3->s4->s5->s6->s7->s8->s9---->s10
         //  At level 1, we combine s5, s6, s7, s8, s9 and store as s5'
         //  At level 2, we combine s1', s3', s5' and form s1'' and store at s1.
-        // Note : the last element(elemnt being added) diff is not added while
+        // Note : the last element(element being added) diff is not added while
         // combining the diffs.
-        ChildrenDiff combined = combineDiff(nodePath[level], current, level);
+        ChildrenDiff combined = combineDiff(nodePath[level], newNode, level);
         if (combined != null) {
           nodePath[level].setSkipDiff(combined, level);
         }
       }
-      nodePath[level].setSkipTo(current, level);
-      current.setSkipTo(null, level);
+      nodePath[level].setSkipTo(newNode, level);
+      newNode.setSkipTo(null, level);
     }
-    return skipNodeList.add(current);
+    return skipNodeList.add(newNode);
   }
 
   private static ChildrenDiff combineDiff(SkipListNode from, SkipListNode to,
       int level) {
     ChildrenDiff combined = null;
+    ChildrenDiff first = null;
+
     SkipListNode cur = from;
     for (int i = level - 1; i >= 0; i--) {
       while (cur != to) {
@@ -362,14 +391,20 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
         if (next == null) {
           break;
         }
-        if (combined == null) {
-          combined = new ChildrenDiff();
+
+        if (first == null) {
+          first = cur.getChildrenDiff(i);
+        } else {
+          if (combined == null) {
+            combined = new ChildrenDiff();
+            combined.combinePosterior(first, null);
+          }
+          combined.combinePosterior(cur.getChildrenDiff(i), null);
         }
-        combined.combinePosterior(cur.getChildrenDiff(i), null);
         cur = next;
       }
     }
-    return combined;
+    return combined != null? combined: first;
   }
 
   /**
@@ -383,6 +418,10 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
     return skipNodeList.get(index).getDiff();
   }
 
+  SkipListNode getSkipListNode(int i) {
+    return skipNodeList.get(i);
+  }
+
   /**
    * Removes the element at the specified position in this list.
    *
@@ -391,12 +430,20 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
    */
   @Override
   public DirectoryDiff remove(int index) {
-    SkipListNode node = getNode(index);
+    final SkipListNode node = getNode(index);
+
     int headLevel = head.level();
     int nodeLevel = node.level();
     final SkipListNode[] nodePath = findPreviousNodes(node, nodeLevel);
+
     for (int level = 0; level <= nodeLevel; level++) {
-      if (nodePath[level] != head && level > 0) {
+      final SkipListNode previous = nodePath[level];
+      final SkipListNode next = node.getSkipNode(level);
+      if (level == 0) {
+        if (next != null) {
+          previous.setSkipDiff4Target(next, 1, previous.getChildrenDiff(0));
+        }
+      } else if (previous != head) {
         // if the last snapshot is deleted, for all the skip level nodes
         // pointing to the last one, the combined children diff at each level
         // > 0 should be made null and skip pointers will be updated to null.
@@ -404,8 +451,8 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
         // the diff of deleted node at each level to the previous skip level
         // node at that level and the skip pointers will be updated to point to
         // the skip nodes of the deleted node.
-        if (index == size() - 1) {
-          nodePath[level].setSkipDiff(null, level);
+        if (next == null) {
+          previous.setSkipDiff(null, level);
         } else {
           /* Ideally at level 0, the deleted diff will be combined with
            * the previous diff , and deleted inodes will be cleaned up
@@ -414,12 +461,23 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
            * {@link AbstractINodeDiffList#deleteSnapshotDiff} function.
            */
           if (node.getChildrenDiff(level) != null) {
-            nodePath[level].getChildrenDiff(level)
-                .combinePosterior(node.getChildrenDiff(level), null);
+            final ChildrenDiff combined;
+            if (previous == nodePath[level - 1]
+                && next == node.getSkipNode(level - 1)) {
+              combined = nodePath[level - 1].getChildrenDiff(level - 1);
+              previous.setSkipDiff4Target(next, level + 1, combined);
+            } else if (next == previous.getSkipNode(level + 1)) {
+              combined = previous.getChildrenDiff(level + 1);
+            } else {
+              combined = new ChildrenDiff();
+              combined.combinePosterior(previous.getChildrenDiff(level), null);
+              combined.combinePosterior(node.getChildrenDiff(level), null);
+            }
+            previous.setSkipDiff(combined, level);
           }
         }
       }
-      nodePath[level].setSkipTo(node.getSkipNode(level), level);
+      previous.setSkipTo(next, level);
     }
     if (nodeLevel == headLevel) {
       head.trim();
@@ -478,24 +536,6 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
     return skipNodeList.get(index);
   }
 
-  /**
-   * Returns the level of the skipList node.
-   *
-   * @param skipInterval The max interval after which the next level promotion
-   *                     should happen.
-   * @param maxLevel     Maximum no of skip levels
-   * @return A value in the range 0 to maxLevel-1.
-   */
-  static int randomLevel(int skipInterval, int maxLevel) {
-    final Random r = ThreadLocalRandom.current();
-    for (int level = 0; level < maxLevel; level++) {
-      // skip to the next level with probability 1/skipInterval
-      if (r.nextInt(skipInterval) > 0) {
-        return level;
-      }
-    }
-    return maxLevel;
-  }
 
   /**
    * This function returns the minimal set of diffs required to combine in
@@ -535,10 +575,10 @@ public class DirectoryDiffList implements DiffList<DirectoryDiff> {
 
   @Override
   public String toString() {
-    final StringBuilder b = new StringBuilder(getClass().getSimpleName());
-    b.append(" head: ").append(head).append(head.skipDiffList);
+    final StringBuilder b = new StringBuilder().append(" head: ");
+    head.appendTo(b);
     for (SkipListNode n : skipNodeList) {
-      b.append("\n  ").append(n).append(n.skipDiffList);
+      n.appendTo(b.append("\n  "));
     }
     return b.toString();
   }

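A stripped-down standalone sketch (not the HDFS class) of the new pointer layout: the level-0 pointer lives in a dedicated next field, level L > 0 lives at skips[L - 1], and the array is grown on demand. The ChildrenDiff payload and trimming are omitted here.

    import java.util.Arrays;

    class SkipNodeSketch {
      SkipNodeSketch next;                              // level 0
      SkipNodeSketch[] skips = new SkipNodeSketch[0];   // levels 1..skips.length

      void setSkipTo(SkipNodeSketch node, int level) {
        if (level == 0) {
          next = node;
        } else {
          if (skips.length < level) {
            skips = Arrays.copyOf(skips, level);        // grow; new slots are null
          }
          skips[level - 1] = node;
        }
      }

      SkipNodeSketch getSkipNode(int level) {
        return level == 0 ? next
            : level <= skips.length ? skips[level - 1]
            : null;
      }
    }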
+ 70 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryDiffListFactory.java

@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.snapshot;
+
+import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature.DirectoryDiff;
+import org.slf4j.Logger;
+
+import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.IntFunction;
+
+/** For creating {@link DiffList} for {@link DirectoryDiff}. */
+public abstract class DirectoryDiffListFactory {
+  public static DiffList<DirectoryDiff> createDiffList(int capacity) {
+    return constructor.apply(capacity);
+  }
+
+  public static void init(int interval, int maxSkipLevels, Logger log) {
+    DirectoryDiffListFactory.skipInterval = interval;
+    DirectoryDiffListFactory.maxLevels = maxSkipLevels;
+
+    if (maxLevels > 0) {
+      constructor = c -> new DiffListBySkipList(c);
+      log.info("SkipList is enabled with skipInterval=" + skipInterval
+          + ", maxLevels=" + maxLevels);
+    } else {
+      constructor = c -> new DiffListByArrayList<>(c);
+      log.info("SkipList is disabled");
+    }
+  }
+
+  private static volatile IntFunction<DiffList<DirectoryDiff>> constructor
+      = c -> new DiffListByArrayList<>(c);
+
+  private static volatile int skipInterval;
+  private static volatile int maxLevels;
+
+  /**
+   * Returns the level of a skip list node.
+   * @return A value in the range 0 to maxLevels.
+   */
+  public static int randomLevel() {
+    final Random r = ThreadLocalRandom.current();
+    for (int level = 0; level < maxLevels; level++) {
+      // skip to the next level with probability 1/skipInterval
+      if (r.nextInt(skipInterval) > 0) {
+        return level;
+      }
+    }
+    return maxLevels;
+  }
+
+
+  private DirectoryDiffListFactory() {}
+}

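A standalone sketch of the level distribution randomLevel() produces: a node is promoted one additional level with probability 1/skipInterval, capped at maxLevels. The parameter values below are illustrative, not the defaults.

    import java.util.Arrays;
    import java.util.concurrent.ThreadLocalRandom;

    public class RandomLevelDemo {
      static int randomLevel(int skipInterval, int maxLevels) {
        for (int level = 0; level < maxLevels; level++) {
          // stop promoting with probability (skipInterval - 1) / skipInterval
          if (ThreadLocalRandom.current().nextInt(skipInterval) > 0) {
            return level;
          }
        }
        return maxLevels;
      }

      public static void main(String[] args) {
        int[] histogram = new int[4];                   // maxLevels = 3 -> levels 0..3
        for (int i = 0; i < 1_000_000; i++) {
          histogram[randomLevel(10, 3)]++;
        }
        // Roughly 90% level 0, 9% level 1, 0.9% level 2, 0.1% level 3.
        System.out.println(Arrays.toString(histogram));
      }
    }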
+ 34 - 8
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectoryWithSnapshotFeature.java

@@ -228,12 +228,18 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
         private List<INode> initChildren() {
           if (children == null) {
             final ChildrenDiff combined = new ChildrenDiff();
-            for (DirectoryDiff d = DirectoryDiff.this; d != null;
-                d = d.getPosterior()) {
+            DirectoryDiffList directoryDiffList =
+                currentDir.getDirectoryWithSnapshotFeature().diffs;
+            final int diffIndex =
+                directoryDiffList.getDiffIndexById(getSnapshotId());
+            List<DirectoryDiff> diffList = directoryDiffList
+                .getDiffListBetweenSnapshots(diffIndex,
+                    directoryDiffList.asList().size(), currentDir);
+            for (DirectoryDiff d : diffList) {
               combined.combinePosterior(d.diff, null);
             }
-            children = combined.apply2Current(ReadOnlyList.Util.asList(
-                currentDir.getChildrenList(Snapshot.CURRENT_STATE_ID)));
+            children = combined.apply2Current(ReadOnlyList.Util
+                .asList(currentDir.getChildrenList(Snapshot.CURRENT_STATE_ID)));
           }
           return children;
         }
@@ -334,6 +340,12 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
         : new INodeDirectoryAttributes.SnapshotCopy(currentDir);
     }
 
+    @Override
+    DiffList<DirectoryDiff> newDiffs() {
+      return DirectoryDiffListFactory
+          .createDiffList(INodeDirectory.DEFAULT_FILES_PER_DIRECTORY);
+    }
+
     /** Replace the given child in the created/deleted list, if there is any. */
     public boolean replaceChild(final ListType type, final INode oldChild,
         final INode newChild) {
@@ -377,6 +389,19 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
       }
       return NO_SNAPSHOT_ID;
     }
+
+    /**
+     * Returns the list of diffs between two indexes corresponding to two
+     * snapshots.
+     * @param fromIndex Index of the diff corresponding to the earlier snapshot
+     * @param toIndex   Index of the diff corresponding to the later snapshot
+     * @param dir       The Directory to which the diffList belongs
+     * @return list of directory diffs
+     */
+    List<DirectoryDiff> getDiffListBetweenSnapshots(int fromIndex, int toIndex,
+        INodeDirectory dir) {
+      return asList().getMinListForRange(fromIndex, toIndex, dir);
+    }
   }
   
   private static Map<INode, INode> cloneDiffList(List<INode> diffList) {
@@ -591,7 +616,7 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
     return diff != null ? diff.getChild(name, true, currentINode)
         : currentINode.getChild(name, Snapshot.CURRENT_STATE_ID);
   }
-  
+
   /** Used to record the modification of a symlink node */
   public INode saveChild2Snapshot(INodeDirectory currentINode,
       final INode child, final int latestSnapshotId, final INode snapshotCopy) {
@@ -672,9 +697,10 @@ public class DirectoryWithSnapshotFeature implements INode.Feature {
 
     boolean dirMetadataChanged = false;
     INodeDirectoryAttributes dirCopy = null;
-    DiffList<DirectoryDiff> difflist = diffs.asList();
-    for (int i = earlierDiffIndex; i < laterDiffIndex; i++) {
-      DirectoryDiff sdiff = difflist.get(i);
+    List<DirectoryDiff> difflist = diffs
+        .getDiffListBetweenSnapshots(earlierDiffIndex, laterDiffIndex,
+            currentINode);
+    for (DirectoryDiff sdiff : difflist) {
       diff.combinePosterior(sdiff.diff, null);
       if (!dirMetadataChanged && sdiff.snapshotINode != null) {
         if (dirCopy == null) {

+ 12 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotManager.java

@@ -36,8 +36,6 @@ import java.util.concurrent.atomic.AtomicInteger;
 import javax.management.ObjectName;
 
 import com.google.common.annotations.VisibleForTesting;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
@@ -60,6 +58,8 @@ import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
 import org.apache.hadoop.metrics2.util.MBeans;
 
 import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Manage snapshottable directories and their snapshots.
@@ -74,7 +74,8 @@ import com.google.common.base.Preconditions;
  * if necessary.
  */
 public class SnapshotManager implements SnapshotStatsMXBean {
-  public static final Log LOG = LogFactory.getLog(SnapshotManager.class);
+  public static final Logger LOG =
+      LoggerFactory.getLogger(SnapshotManager.class);
 
   private final FSDirectory fsdir;
   private boolean captureOpenFiles;
@@ -127,6 +128,14 @@ public class SnapshotManager implements SnapshotStatsMXBean {
         + snapshotDiffAllowSnapRootDescendant
         + ", maxSnapshotLimit: "
         + maxSnapshotLimit);
+
+    final int maxLevels = conf.getInt(
+        DFSConfigKeys.DFS_NAMENODE_SNAPSHOT_SKIPLIST_MAX_LEVELS,
+        DFSConfigKeys.DFS_NAMENODE_SNAPSHOT_SKIPLIST_MAX_SKIP_LEVELS_DEFAULT);
+    final int skipInterval = conf.getInt(
+        DFSConfigKeys.DFS_NAMENODE_SNAPSHOT_SKIPLIST_SKIP_INTERVAL,
+        DFSConfigKeys.DFS_NAMENODE_SNAPSHOT_SKIPLIST_SKIP_INTERVAL_DEFAULT);
+    DirectoryDiffListFactory.init(skipInterval, maxLevels, LOG);
   }
 
   @VisibleForTesting

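A hedged sketch of wiring this up from a configuration; the key strings match the hdfs-default.xml entries further down, and the chosen values are illustrative.

    // Illustrative values; 0 max levels keeps the existing linear DiffList.
    Configuration conf = new HdfsConfiguration();
    conf.setInt("dfs.namenode.snapshot.skiplist.max.levels", 3);
    conf.setInt("dfs.namenode.snapshot.skiplist.interval", 10);
    // The SnapshotManager constructor then calls
    // DirectoryDiffListFactory.init(skipInterval, maxLevels, LOG), after which
    // DirectoryDiffListFactory.createDiffList(capacity) returns a
    // DiffListBySkipList instead of a DiffListByArrayList.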
+ 4 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/web/resources/NamenodeWebHdfsMethods.java

@@ -25,6 +25,7 @@ import java.io.PrintWriter;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.net.URLDecoder;
 import java.net.UnknownHostException;
 import java.security.Principal;
 import java.security.PrivilegedExceptionAction;
@@ -995,7 +996,9 @@ public class NamenodeWebHdfsMethods {
     return doAs(ugi, new PrivilegedExceptionAction<Response>() {
       @Override
       public Response run() throws IOException, URISyntaxException {
-        return get(ugi, delegation, username, doAsUser, path.getAbsolutePath(),
+        String absolutePath = path.getAbsolutePath() == null ? null :
+            URLDecoder.decode(path.getAbsolutePath(), "UTF-8");
+        return get(ugi, delegation, username, doAsUser, absolutePath,
             op, offset, length, renewer, bufferSize, xattrNames, xattrEncoding,
             excludeDatanodes, fsAction, snapshotName, oldSnapshotName,
             tokenKind, tokenService, noredirect, startAfter);

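A tiny standalone example of what the added URLDecoder.decode call does to a percent-encoded WebHDFS path; the sample path is made up.

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    public class DecodeDemo {
      public static void main(String[] args) throws UnsupportedEncodingException {
        // Illustrative path only: an encoded space in a WebHDFS request path.
        String raw = "/user/alice/dir%20with%20space";
        System.out.println(URLDecoder.decode(raw, "UTF-8"));  // /user/alice/dir with space
      }
    }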
+ 5 - 4
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java

@@ -67,17 +67,18 @@ public interface NamenodeProtocol {
   /**
    * Get a list of blocks belonging to <code>datanode</code>
    * whose total size equals <code>size</code>.
-   * 
+   *
    * @see org.apache.hadoop.hdfs.server.balancer.Balancer
    * @param datanode  a data node
    * @param size      requested size
+   * @param minBlockSize each returned block must be at least this size, in bytes
    * @return          a list of blocks & their locations
    * @throws IOException if size is less than or equal to 0 or
-                                   datanode does not exist
+   *                     datanode does not exist
    */
   @Idempotent
-  public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size)
-  throws IOException;
+  BlocksWithLocations getBlocks(DatanodeInfo datanode, long size, long
+      minBlockSize) throws IOException;
 
   /**
    * Get the current block keys

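An illustrative fragment (the values are made up) of how a balancer-style caller passes the new argument so that blocks below the threshold are never returned:

    // Ask for up to 2 GB worth of blocks, ignoring blocks smaller than 10 MB.
    long size = 2L * 1024 * 1024 * 1024;
    long minBlockSize = 10L * 1024 * 1024;
    BlocksWithLocations blocks =
        namenodeProtocol.getBlocks(datanodeInfo, size, minBlockSize);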
+ 15 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DiskBalancerCLI.java

@@ -84,6 +84,13 @@ public class DiskBalancerCLI extends Configured implements Tool {
    * Executes a given plan file on the target datanode.
    */
   public static final String EXECUTE = "execute";
+
+  /**
+   * Skips the date check (by default a plan is valid for 24 hours) and forces
+   * execution of the plan.
+   */
+  public static final String SKIPDATECHECK = "skipDateCheck";
+
   /**
    * The report command prints out a disk fragmentation report about the data
    * cluster. By default it prints the DEFAULT_TOP machines names with high
@@ -342,7 +349,15 @@ public class DiskBalancerCLI extends Configured implements Tool {
             "submits it for execution by the datanode.")
         .create();
     getExecuteOptions().addOption(execute);
+
+
+    Option skipDateCheck = OptionBuilder.withLongOpt(SKIPDATECHECK)
+        .withDescription("skips the date check and force execute the plan")
+        .create();
+    getExecuteOptions().addOption(skipDateCheck);
+
     opt.addOption(execute);
+    opt.addOption(skipDateCheck);
   }
 
   /**

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/proto/NamenodeProtocol.proto

@@ -43,6 +43,7 @@ import "HdfsServer.proto";
 message GetBlocksRequestProto {
   required DatanodeIDProto datanode = 1; // Datanode ID
   required uint64 size = 2;              // Size in bytes
+  optional uint64 minBlockSize = 3 [default = 0]; // Minimum Block Size in bytes
 }
 
  

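Because the new field is optional with an explicit default, a request serialized by an older client that never sets it simply reads back 0 on the server, keeping the RPC wire-compatible. A hedged sketch using the generated protobuf classes; the generated class and builder names are assumptions based on the .proto file name.

    // Assumed generated names: NamenodeProtocolProtos.GetBlocksRequestProto.
    GetBlocksRequestProto oldClientRequest = GetBlocksRequestProto.newBuilder()
        .setDatanode(datanodeIdProto)    // required field, assumed already built
        .setSize(fileLen)
        .build();                        // minBlockSize intentionally left unset
    assert !oldClientRequest.hasMinBlockSize();
    assert oldClientRequest.getMinBlockSize() == 0;  // server treats 0 as "no minimum"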
+ 18 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

@@ -4362,7 +4362,6 @@
     across to the client within one rpc call.
   </description>
 </property>
-
 <property>
   <name>dfs.namenode.snapshot.max.limit</name>
   <value>65536</value>
@@ -4373,6 +4372,24 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.namenode.snapshot.skiplist.max.levels</name>
+  <value>0</value>
+  <description>
+    Maximum number of skip levels to be maintained in the skip list for
+    storing directory snapshot diffs. The default is 0, in which case a
+    linear list is used to store the directory snapshot diffs.
+  </description>
+</property>
+<property>
+  <name>dfs.namenode.snapshot.skiplist.interval</name>
+  <value>10</value>
+  <description>
+    The interval after which skip levels are formed in the skip list for
+    storing directory snapshot diffs. The default value is 10.
+  </description>
+</property>
+
 <property>
   <name>dfs.pipeline.ecn</name>
   <value>false</value>

+ 5 - 0
hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSRouterFederation.md

@@ -350,6 +350,11 @@ Monitor the namenodes in the subclusters for forwarding the client requests.
 | dfs.federation.router.monitor.namenode | | The identifier of the namenodes to monitor and heartbeat. |
 | dfs.federation.router.monitor.localnamenode.enable | `true` | If `true`, the Router should monitor the namenode in the local machine. |
 
+Note: Configuring *dfs.nameservice.id* is recommended when *dfs.federation.router.monitor.localnamenode.enable* is enabled.
+This allows the Router to find the local namenode directly. Otherwise, it determines the nameservice ID by matching the namenode RPC address with the
+local node address; if multiple addresses match, the Router will fail to start. In addition, if the local namenode runs in HA mode, it is also recommended
+to configure *dfs.ha.namenode.id*.
+
 ### Quota
 
 Global quota supported in federation.

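For illustration, the same recommendation expressed as a Java Configuration fragment; the property names are as documented above, and the values are placeholders for a particular deployment.

    Configuration conf = new HdfsConfiguration();
    // Lets the Router resolve the local namenode directly instead of matching
    // RPC addresses against local interfaces.
    conf.set("dfs.nameservice.id", "ns0");
    // Also needed when the local namenode is part of an HA pair.
    conf.set("dfs.ha.namenode.id", "nn1");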
+ 223 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSInotifyEventInputStreamKerberized.java

@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.inotify.EventBatch;
+import org.apache.hadoop.hdfs.qjournal.MiniQJMHACluster;
+import org.apache.hadoop.hdfs.qjournal.TestSecureNNWithQJM;
+import org.apache.hadoop.http.HttpConfig;
+import org.apache.hadoop.ipc.Client;
+import org.apache.hadoop.minikdc.MiniKdc;
+import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.ssl.KeyStoreTestUtil;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.Timeout;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.security.PrivilegedExceptionAction;
+import java.util.Properties;
+
+import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_CLIENT_CONNECTION_IDLESCANINTERVAL_KEY;
+import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SASL_KEY;
+import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN;
+import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_HTTPS_KEYSTORE_RESOURCE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_KERBEROS_PRINCIPAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_KEYTAB_FILE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HTTP_POLICY_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_JOURNALNODE_HTTPS_ADDRESS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_JOURNALNODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_JOURNALNODE_KERBEROS_PRINCIPAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_JOURNALNODE_KEYTAB_FILE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SERVER_HTTPS_KEYSTORE_RESOURCE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_WEB_AUTHENTICATION_KERBEROS_PRINCIPAL_KEY;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Class for Kerberized test cases for {@link DFSInotifyEventInputStream}.
+ */
+public class TestDFSInotifyEventInputStreamKerberized {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TestDFSInotifyEventInputStreamKerberized.class);
+
+  private File baseDir;
+  private String keystoresDir;
+  private String sslConfDir;
+
+  private MiniKdc kdc;
+  private Configuration baseConf;
+  private Configuration conf;
+  private MiniQJMHACluster cluster;
+  private File generalHDFSKeytabFile;
+  private File nnKeytabFile;
+
+  @Rule
+  public Timeout timeout = new Timeout(180000);
+
+  @Test
+  public void testWithKerberizedCluster() throws Exception {
+    conf = new HdfsConfiguration(baseConf);
+    // make sure relogin can happen after tgt expiration.
+    conf.setInt(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, 3);
+    // make sure the rpc connection is not reused
+    conf.setInt(IPC_CLIENT_CONNECTION_IDLESCANINTERVAL_KEY, 100);
+    conf.setInt(IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY, 2000);
+    Client.setConnectTimeout(conf, 2000);
+    // force the remote journal to be the only edits dir, so we can test
+    // EditLogFileInputStream$URLLog behavior.
+    cluster = new MiniQJMHACluster.Builder(conf).setForceRemoteEditsOnly(true)
+        .build();
+    cluster.getDfsCluster().waitActive();
+    cluster.getDfsCluster().transitionToActive(0);
+
+    final UserGroupInformation ugi = UserGroupInformation
+        .loginUserFromKeytabAndReturnUGI("hdfs",
+            generalHDFSKeytabFile.getAbsolutePath());
+
+    UserGroupInformation.setShouldRenewImmediatelyForTests(true);
+    ugi.doAs(new PrivilegedExceptionAction<Void>() {
+      @Override
+      public Void run() throws Exception {
+        LOG.info("Current user is: " + UserGroupInformation.getCurrentUser()
+            + " login user is:" + UserGroupInformation.getLoginUser());
+        Configuration clientConf =
+            new Configuration(cluster.getDfsCluster().getConfiguration(0));
+        try (DistributedFileSystem clientFs =
+            (DistributedFileSystem) FileSystem.get(clientConf)) {
+          clientFs.mkdirs(new Path("/test"));
+          LOG.info("mkdir /test success");
+          final DFSInotifyEventInputStream eis =
+              clientFs.getInotifyEventStream();
+          // verify we can poll
+          EventBatch batch;
+          while ((batch = eis.poll()) != null) {
+            LOG.info("txid: " + batch.getTxid());
+          }
+          assertNull("poll should not return anything", eis.poll());
+
+          Thread.sleep(6000);
+          LOG.info("Slept 6 seconds to make sure the TGT has expired.");
+
+          UserGroupInformation.getCurrentUser()
+              .checkTGTAndReloginFromKeytab();
+          clientFs.mkdirs(new Path("/test1"));
+          LOG.info("mkdir /test1 success");
+
+          // verify we can poll after a tgt expiration interval
+          batch = eis.poll();
+          assertNotNull("poll should return something", batch);
+          assertEquals(1, batch.getEvents().length);
+          assertNull("poll should not return anything", eis.poll());
+          return null;
+        }
+      }
+    });
+  }
+
+  @Before
+  public void initKerberizedCluster() throws Exception {
+    baseDir = new File(System.getProperty("test.build.dir", "target/test-dir"),
+        TestDFSInotifyEventInputStreamKerberized.class.getSimpleName());
+    FileUtil.fullyDelete(baseDir);
+    assertTrue(baseDir.mkdirs());
+
+    final Properties kdcConf = MiniKdc.createConf();
+    kdcConf.setProperty(MiniKdc.MAX_TICKET_LIFETIME, "5");
+    kdcConf.setProperty(MiniKdc.MIN_TICKET_LIFETIME, "5");
+    kdc = new MiniKdc(kdcConf, baseDir);
+    kdc.start();
+
+    baseConf = new HdfsConfiguration();
+    SecurityUtil.setAuthenticationMethod(
+        UserGroupInformation.AuthenticationMethod.KERBEROS, baseConf);
+    UserGroupInformation.setConfiguration(baseConf);
+    assertTrue("Expected configuration to enable security",
+        UserGroupInformation.isSecurityEnabled());
+
+    final String userName = "hdfs";
+    nnKeytabFile = new File(baseDir, userName + ".keytab");
+    final String keytab = nnKeytabFile.getAbsolutePath();
+    // Windows will not reverse name lookup "127.0.0.1" to "localhost".
+    final String krbInstance = Path.WINDOWS ? "127.0.0.1" : "localhost";
+    kdc.createPrincipal(nnKeytabFile, userName + "/" + krbInstance,
+        "HTTP/" + krbInstance);
+    generalHDFSKeytabFile = new File(baseDir, "hdfs_general.keytab");
+    kdc.createPrincipal(generalHDFSKeytabFile, "hdfs");
+    assertTrue(generalHDFSKeytabFile.exists());
+    final String hdfsPrincipal =
+        userName + "/" + krbInstance + "@" + kdc.getRealm();
+    final String spnegoPrincipal = "HTTP/" + krbInstance + "@" + kdc.getRealm();
+
+    baseConf.set(DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, hdfsPrincipal);
+    baseConf.set(DFS_NAMENODE_KEYTAB_FILE_KEY, keytab);
+    baseConf.set(DFS_DATANODE_KERBEROS_PRINCIPAL_KEY, hdfsPrincipal);
+    baseConf.set(DFS_DATANODE_KEYTAB_FILE_KEY, keytab);
+    baseConf
+        .set(DFS_WEB_AUTHENTICATION_KERBEROS_PRINCIPAL_KEY, spnegoPrincipal);
+    baseConf.set(DFS_JOURNALNODE_KEYTAB_FILE_KEY, keytab);
+    baseConf.set(DFS_JOURNALNODE_KERBEROS_PRINCIPAL_KEY, hdfsPrincipal);
+    baseConf.set(DFS_JOURNALNODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY,
+        spnegoPrincipal);
+    baseConf.setBoolean(DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true);
+    baseConf.set(DFS_HTTP_POLICY_KEY, HttpConfig.Policy.HTTPS_ONLY.name());
+    baseConf.set(DFS_NAMENODE_HTTPS_ADDRESS_KEY, "localhost:0");
+    baseConf.set(DFS_DATANODE_HTTPS_ADDRESS_KEY, "localhost:0");
+    baseConf.set(DFS_JOURNALNODE_HTTPS_ADDRESS_KEY, "localhost:0");
+    baseConf.setInt(IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SASL_KEY, 10);
+
+    keystoresDir = baseDir.getAbsolutePath();
+    sslConfDir = KeyStoreTestUtil.getClasspathDir(TestSecureNNWithQJM.class);
+    KeyStoreTestUtil.setupSSLConfig(keystoresDir, sslConfDir, baseConf, false);
+    baseConf.set(DFS_CLIENT_HTTPS_KEYSTORE_RESOURCE_KEY,
+        KeyStoreTestUtil.getClientSSLConfigFileName());
+    baseConf.set(DFS_SERVER_HTTPS_KEYSTORE_RESOURCE_KEY,
+        KeyStoreTestUtil.getServerSSLConfigFileName());
+  }
+
+  @After
+  public void shutdownCluster() throws Exception {
+    if (cluster != null) {
+      cluster.shutdown();
+    }
+    if (kdc != null) {
+      kdc.stop();
+    }
+    FileUtil.fullyDelete(baseDir);
+    KeyStoreTestUtil.cleanupSSLConfig(keystoresDir, sslConfDir);
+  }
+}

+ 44 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java

@@ -540,6 +540,50 @@ public class TestEncryptionZones {
     assertZonePresent(null, rootDir.toString());
   }
 
+  @Test
+  public void testEZwithFullyQualifiedPath() throws Exception {
+    /* Test creation of EZs and trash provisioning with fully qualified paths. */
+    final Path zoneParent = new Path("/zones");
+    final Path zone1 = new Path(zoneParent, "zone1");
+    final Path zone1FQP = new Path(cluster.getURI().toString(), zone1);
+    final Path zone2 = new Path(zoneParent, "zone2");
+    final Path zone2FQP = new Path(cluster.getURI().toString(), zone2);
+
+    int numZones = 0;
+    EnumSet<CreateEncryptionZoneFlag> withTrash = EnumSet
+        .of(CreateEncryptionZoneFlag.PROVISION_TRASH);
+
+    // Create EZ with Trash using FQP
+    fsWrapper.mkdir(zone1FQP, FsPermission.getDirDefault(), true);
+    dfsAdmin.createEncryptionZone(zone1FQP, TEST_KEY, withTrash);
+    assertNumZones(++numZones);
+    assertZonePresent(TEST_KEY, zone1.toString());
+    // Check that zone1 contains a .Trash directory
+    final Path zone1Trash = new Path(zone1, fs.TRASH_PREFIX);
+    assertTrue("CreateEncryptionZone with trash enabled should create a " +
+        ".Trash directory in the EZ", fs.exists(zone1Trash));
+
+    // getEncryptionZoneForPath for FQP should return the path component
+    EncryptionZone ezForZone1 = dfsAdmin.getEncryptionZoneForPath(zone1FQP);
+    assertTrue("getEncryptionZoneForPath for fully qualified path should " +
+        "return the path component",
+        ezForZone1.getPath().equals(zone1.toString()));
+
+    // Create EZ without Trash
+    fsWrapper.mkdir(zone2FQP, FsPermission.getDirDefault(), true);
+    dfsAdmin.createEncryptionZone(zone2FQP, TEST_KEY, NO_TRASH);
+    assertNumZones(++numZones);
+    assertZonePresent(TEST_KEY, zone2.toString());
+
+    // Provision Trash on zone2 using FQP
+    dfsAdmin.provisionEncryptionZoneTrash(zone2FQP);
+    EncryptionZone ezForZone2 = dfsAdmin.getEncryptionZoneForPath(zone2FQP);
+    Path ezTrashForZone2 = new Path(ezForZone2.getPath(),
+        FileSystem.TRASH_PREFIX);
+    assertTrue("provisionEZTrash with fully qualified path should create " +
+        "trash directory ", fsWrapper.exists(ezTrashForZone2));
+  }
+
   /**
    * Test listing encryption zones as a non super user.
    */

+ 40 - 10
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java

@@ -229,32 +229,48 @@ public class TestGetBlocks {
       NamenodeProtocol namenode = NameNodeProxies.createProxy(CONF,
           DFSUtilClient.getNNUri(addr), NamenodeProtocol.class).getProxy();
 
-      // get blocks of size fileLen from dataNodes[0]
+      // get blocks of size fileLen from dataNodes[0], with minBlockSize as
+      // fileLen
       BlockWithLocations[] locs;
-      locs = namenode.getBlocks(dataNodes[0], fileLen).getBlocks();
-      assertEquals(locs.length, 12);
+
+      // Should return all 13 blocks, as minBlockSize is not passed
+      locs = namenode.getBlocks(dataNodes[0], fileLen, 0)
+          .getBlocks();
+      assertEquals(13, locs.length);
+      assertEquals(locs[0].getStorageIDs().length, 2);
+      assertEquals(locs[1].getStorageIDs().length, 2);
+
+      // Should return 12 blocks, as minBlockSize is DEFAULT_BLOCK_SIZE
+      locs = namenode.getBlocks(dataNodes[0], fileLen, DEFAULT_BLOCK_SIZE)
+          .getBlocks();
+      assertEquals(12, locs.length);
       assertEquals(locs[0].getStorageIDs().length, 2);
       assertEquals(locs[1].getStorageIDs().length, 2);
 
       // get blocks of size BlockSize from dataNodes[0]
-      locs = namenode.getBlocks(dataNodes[0], DEFAULT_BLOCK_SIZE).getBlocks();
+      locs = namenode.getBlocks(dataNodes[0], DEFAULT_BLOCK_SIZE,
+          DEFAULT_BLOCK_SIZE).getBlocks();
       assertEquals(locs.length, 1);
       assertEquals(locs[0].getStorageIDs().length, 2);
 
       // get blocks of size 1 from dataNodes[0]
-      locs = namenode.getBlocks(dataNodes[0], 1).getBlocks();
+      locs = namenode.getBlocks(dataNodes[0], 1, 1).getBlocks();
       assertEquals(locs.length, 1);
       assertEquals(locs[0].getStorageIDs().length, 2);
 
       // get blocks of size 0 from dataNodes[0]
-      getBlocksWithException(namenode, dataNodes[0], 0);
+      getBlocksWithException(namenode, dataNodes[0], 0, 0);
 
       // get blocks of size -1 from dataNodes[0]
-      getBlocksWithException(namenode, dataNodes[0], -1);
+      getBlocksWithException(namenode, dataNodes[0], -1, 0);
+
+      // minBlockSize is -1
+      getBlocksWithException(namenode, dataNodes[0], DEFAULT_BLOCK_SIZE, -1);
 
       // get blocks of size BlockSize from a non-existent datanode
       DatanodeInfo info = DFSTestUtil.getDatanodeInfo("1.2.3.4");
-      getBlocksWithException(namenode, info, 2);
+      getBlocksWithIncorrectDatanodeException(namenode, info, 2, 0);
+
 
       testBlockIterator(cluster);
     } finally {
@@ -263,10 +279,24 @@ public class TestGetBlocks {
   }
 
   private void getBlocksWithException(NamenodeProtocol namenode,
-      DatanodeInfo datanode, long size) throws IOException {
+      DatanodeInfo datanode, long size, long minBlockSize) throws IOException {
+    boolean getException = false;
+    try {
+      namenode.getBlocks(datanode, size, minBlockSize);
+    } catch (RemoteException e) {
+      getException = true;
+      assertTrue(e.getClassName().contains("IllegalArgumentException"));
+    }
+    assertTrue(getException);
+  }
+
+  private void getBlocksWithIncorrectDatanodeException(
+      NamenodeProtocol namenode, DatanodeInfo datanode, long size,
+      long minBlockSize)
+      throws IOException {
     boolean getException = false;
     try {
-      namenode.getBlocks(DFSTestUtil.getLocalDatanodeInfo(), 2);
+      namenode.getBlocks(datanode, size, minBlockSize);
     } catch (RemoteException e) {
       getException = true;
       assertTrue(e.getClassName().contains("HadoopIllegalArgumentException"));

+ 16 - 5
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/MiniQJMHACluster.java

@@ -48,6 +48,7 @@ public class MiniQJMHACluster {
     private StartupOption startOpt = null;
     private int numNNs = 2;
     private final MiniDFSCluster.Builder dfsBuilder;
+    private boolean forceRemoteEditsOnly = false;
 
     public Builder(Configuration conf) {
       this.conf = conf;
@@ -72,6 +73,11 @@ public class MiniQJMHACluster {
       this.numNNs = nns;
       return this;
     }
+
+    public Builder setForceRemoteEditsOnly(boolean val) {
+      this.forceRemoteEditsOnly = val;
+      return this;
+    }
   }
 
   public static MiniDFSNNTopology createDefaultTopology(int nns, int startingPort) {
@@ -107,7 +113,7 @@ public class MiniQJMHACluster {
         // start cluster with specified NameNodes
         MiniDFSNNTopology topology = createDefaultTopology(builder.numNNs, basePort);
 
-        initHAConf(journalURI, builder.conf, builder.numNNs, basePort);
+        initHAConf(journalURI, builder, basePort);
 
         // First start up the NNs just to format the namespace. The MinIDFSCluster
         // has no way to just format the NameNodes without also starting them.
@@ -139,14 +145,19 @@ public class MiniQJMHACluster {
     }
   }
 
-  private Configuration initHAConf(URI journalURI, Configuration conf,
-      int numNNs, int basePort) {
+  private Configuration initHAConf(URI journalURI, Builder builder,
+      int basePort) {
     conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY,
         journalURI.toString());
+    if (builder.forceRemoteEditsOnly) {
+      conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY, journalURI.toString());
+      conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY,
+          journalURI.toString());
+    }
 
-    List<String> nns = new ArrayList<String>(numNNs);
+    List<String> nns = new ArrayList<>(builder.numNNs);
     int port = basePort;
-    for (int i = 0; i < numNNs; i++) {
+    for (int i = 0; i < builder.numNNs; i++) {
       nns.add("127.0.0.1:" + port);
       // increment by 2 each time to account for the http port in the config setting
       port += 2;

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java

@@ -2072,7 +2072,7 @@ public class TestBalancer {
         endGetBlocksTime = Time.monotonicNow();
         numGetBlocksCalls++;
         return blk;
-      }}).when(fsnSpy).getBlocks(any(DatanodeID.class), anyLong());
+      }}).when(fsnSpy).getBlocks(any(DatanodeID.class), anyLong(), anyLong());
   }
 
   /**

+ 17 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/DiskBalancerTestUtil.java

@@ -40,6 +40,10 @@ import org.apache.hadoop.hdfs.server.diskbalancer.datamodel.DiskBalancerVolume;
 import org.apache.hadoop.hdfs.server.diskbalancer.datamodel.DiskBalancerVolumeSet;
 import org.apache.hadoop.util.Time;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
@@ -51,6 +55,7 @@ import java.util.concurrent.TimeoutException;
  * Helper class to create various cluster configurations at run time.
  */
 public class DiskBalancerTestUtil {
+  static final Logger LOG = LoggerFactory.getLogger(TestDiskBalancer.class);
   public static final long MB = 1024 * 1024L;
   public static final long GB = MB * 1024L;
   public static final long TB = GB * 1024L;
@@ -241,17 +246,25 @@ public class DiskBalancerTestUtil {
    * @return Number of Blocks.
    * @throws IOException
    */
-  public static int getBlockCount(FsVolumeSpi source) throws IOException {
+  public static int getBlockCount(FsVolumeSpi source,
+                                  boolean checkblockPoolCount)
+      throws IOException {
     int count = 0;
     for (String blockPoolID : source.getBlockPoolList()) {
       FsVolumeSpi.BlockIterator sourceIter =
           source.newBlockIterator(blockPoolID, "TestDiskBalancerSource");
+      int blockCount = 0;
       while (!sourceIter.atEnd()) {
         ExtendedBlock block = sourceIter.nextBlock();
         if (block != null) {
-          count++;
+          blockCount++;
         }
       }
+      if (checkblockPoolCount) {
+        LOG.info("Block Pool Id:  {}, blockCount: {}", blockPoolID, blockCount);
+        assertTrue(blockCount > 0);
+      }
+      count += blockCount;
     }
     return count;
   }
@@ -320,10 +333,10 @@ public class DiskBalancerTestUtil {
                dnNode.getFSDataset().getFsVolumeReferences()) {
         source = (FsVolumeImpl) refs.get(0);
         dest = (FsVolumeImpl) refs.get(1);
-        assertTrue(DiskBalancerTestUtil.getBlockCount(source) > 0);
+        assertTrue(DiskBalancerTestUtil.getBlockCount(source, true) > 0);
         DiskBalancerTestUtil.moveAllDataToDestVolume(dnNode.getFSDataset(),
             source, dest);
-        assertTrue(DiskBalancerTestUtil.getBlockCount(source) == 0);
+        assertEquals(0, DiskBalancerTestUtil.getBlockCount(source, false));
       }
     }
 

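The new boolean distinguishes two call patterns, both visible in the tests of this change: pass true to additionally assert that every block pool on the volume still holds blocks, and false when only the total matters, e.g. after all data has been moved off the source volume.

    // Before the move: every block pool on the source volume must have data.
    assertTrue(DiskBalancerTestUtil.getBlockCount(source, true) > 0);
    // After the move: only the total counts, so skip the per-pool assertion.
    assertEquals(0, DiskBalancerTestUtil.getBlockCount(source, false));
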
+ 64 - 8
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/TestDiskBalancer.java

@@ -28,6 +28,7 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.server.balancer.TestBalancer;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
@@ -160,6 +161,62 @@ public class TestDiskBalancer {
     }
   }
 
+  @Test
+  public void testDiskBalancerWithFederatedCluster() throws Exception {
+
+    Configuration conf = new HdfsConfiguration();
+    conf.setBoolean(DFSConfigKeys.DFS_DISK_BALANCER_ENABLED, true);
+    final int blockCount = 100;
+    final int blockSize = 1024;
+    final int diskCount = 2;
+    final int dataNodeCount = 1;
+    final int dataNodeIndex = 0;
+    final int sourceDiskIndex = 0;
+    final long cap = blockSize * 3L * blockCount;
+
+    conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize);
+    conf.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, blockSize);
+
+    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+        .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+        .numDataNodes(dataNodeCount)
+        .storagesPerDatanode(diskCount)
+        .storageCapacities(new long[] {cap, cap})
+        .build();
+    cluster.waitActive();
+
+    DFSTestUtil.setFederatedConfiguration(cluster, conf);
+
+    final String fileName = "/tmp.txt";
+    final Path filePath = new Path(fileName);
+    long fileLen = blockCount * blockSize;
+
+
+    FileSystem fs = cluster.getFileSystem(0);
+    TestBalancer.createFile(cluster, filePath, fileLen, (short) 1,
+        0);
+    DFSTestUtil.waitReplication(fs, filePath, (short) 1);
+
+    fs = cluster.getFileSystem(1);
+    TestBalancer.createFile(cluster, filePath, fileLen, (short) 1,
+        1);
+    DFSTestUtil.waitReplication(fs, filePath, (short) 1);
+
+    try {
+      DataMover dataMover = new DataMover(cluster, dataNodeIndex,
+          sourceDiskIndex, conf, blockSize, blockCount);
+      dataMover.moveDataToSourceDisk();
+      NodePlan plan = dataMover.generatePlan();
+      dataMover.executePlan(plan);
+      dataMover.verifyPlanExectionDone();
+      dataMover.verifyAllVolumesHaveData();
+      dataMover.verifyTolerance(plan, 0, sourceDiskIndex, 10);
+    } finally {
+      cluster.shutdown();
+    }
+
+  }
+
   @Test
   public void testBalanceDataBetweenMultiplePairsOfVolumes()
       throws Exception {
@@ -599,9 +656,9 @@ public class TestDiskBalancer {
       try (FsDatasetSpi.FsVolumeReferences refs =
                node.getFSDataset().getFsVolumeReferences()) {
         for (FsVolumeSpi volume : refs) {
-          assertTrue(DiskBalancerTestUtil.getBlockCount(volume) > 0);
-          LOG.info(refs.toString() + " : Block Count : {}",
-              DiskBalancerTestUtil.getBlockCount(volume));
+          assertTrue(DiskBalancerTestUtil.getBlockCount(volume, true) > 0);
+          LOG.info("{} : Block Count : {}", refs, DiskBalancerTestUtil
+              .getBlockCount(volume, true));
         }
       }
     }
@@ -618,12 +675,11 @@ public class TestDiskBalancer {
       try (FsDatasetSpi.FsVolumeReferences refs =
                node.getFSDataset().getFsVolumeReferences()) {
         volume = (FsVolumeImpl) refs.get(sourceDiskIndex);
-        assertTrue(DiskBalancerTestUtil.getBlockCount(volume) > 0);
+        assertTrue(DiskBalancerTestUtil.getBlockCount(volume, true) > 0);
 
-        assertTrue(
-            (DiskBalancerTestUtil.getBlockCount(volume) *
-                (blockSize + delta)) >=
-                plan.getVolumeSetPlans().get(0).getBytesToMove());
+        assertTrue((DiskBalancerTestUtil.getBlockCount(volume, true) *
+            (blockSize + delta)) >= plan.getVolumeSetPlans().get(0)
+            .getBytesToMove());
       }
     }
   }

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/TestDiskBalancerRPC.java

@@ -51,7 +51,7 @@ import java.util.Random;
 import static org.apache.hadoop.hdfs.server.datanode.DiskBalancerWorkStatus.Result.NO_PLAN;
 import static org.apache.hadoop.hdfs.server.datanode.DiskBalancerWorkStatus.Result.PLAN_DONE;
 import static org.apache.hadoop.hdfs.server.datanode.DiskBalancerWorkStatus.Result.PLAN_UNDER_PROGRESS;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertEquals;
 
 /**
  * Test DiskBalancer RPC.
@@ -265,7 +265,7 @@ public class TestDiskBalancerRPC {
         dest = (FsVolumeImpl) refs.get(1);
         DiskBalancerTestUtil.moveAllDataToDestVolume(dnNode.getFSDataset(),
             source, dest);
-        assertTrue(DiskBalancerTestUtil.getBlockCount(source) == 0);
+        assertEquals(0, DiskBalancerTestUtil.getBlockCount(source, false));
       } finally {
         refs.close();
       }

+ 40 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/diskbalancer/command/TestDiskBalancerCommand.java

@@ -27,6 +27,7 @@ import static org.apache.hadoop.hdfs.tools.DiskBalancerCLI.OUTFILE;
 import static org.apache.hadoop.hdfs.tools.DiskBalancerCLI.PLAN;
 import static org.apache.hadoop.hdfs.tools.DiskBalancerCLI.QUERY;
 import static org.apache.hadoop.hdfs.tools.DiskBalancerCLI.REPORT;
+import static org.apache.hadoop.hdfs.tools.DiskBalancerCLI.SKIPDATECHECK;
 import static org.hamcrest.CoreMatchers.allOf;
 import static org.hamcrest.CoreMatchers.containsString;
 import static org.hamcrest.CoreMatchers.is;
@@ -276,6 +277,45 @@ public class TestDiskBalancerCommand {
     }
   }
 
+  @Test(timeout = 600000)
+  public void testDiskBalancerForceExecute() throws
+      Exception {
+    final int numDatanodes = 1;
+
+    final Configuration hdfsConf = new HdfsConfiguration();
+    hdfsConf.setBoolean(DFSConfigKeys.DFS_DISK_BALANCER_ENABLED, true);
+    hdfsConf.set(DFSConfigKeys.DFS_DISK_BALANCER_PLAN_VALID_INTERVAL, "0d");
+
+    /* new cluster with imbalanced capacity */
+    final MiniDFSCluster miniCluster = DiskBalancerTestUtil.
+        newImbalancedCluster(
+            hdfsConf,
+            numDatanodes,
+            CAPACITIES,
+            DEFAULT_BLOCK_SIZE,
+            FILE_LEN);
+
+    try {
+      /* get full path of plan */
+      final String planFileFullName = runAndVerifyPlan(miniCluster, hdfsConf);
+
+      /* run execute command */
+      final String cmdLine = String.format(
+          "hdfs diskbalancer -%s %s -%s",
+          EXECUTE,
+          planFileFullName,
+          SKIPDATECHECK);
+
+      // Disk Balancer should execute the plan since the skipDateCheck
+      // option is specified.
+      runCommand(cmdLine, hdfsConf, miniCluster);
+    } finally {
+      if (miniCluster != null) {
+        miniCluster.shutdown();
+      }
+    }
+  }
+
 
   @Test(timeout = 600000)
   public void testDiskBalancerExecuteOptionPlanValidity() throws Exception {

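Spelled out, the command the new test builds has the shape below; the plan path is hypothetical (in the test it is whatever runAndVerifyPlan() returns), and the option spellings "execute" and "skipDateCheck" are assumed to be the values of the DiskBalancerCLI constants imported above.

    // Equivalent shape of the formatted command line (plan path is hypothetical):
    //   hdfs diskbalancer -execute /path/to/datanode-0.plan.json -skipDateCheck
    String cmdLine = String.format("hdfs diskbalancer -%s %s -%s",
        EXECUTE, "/path/to/datanode-0.plan.json", SKIPDATECHECK);
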
+ 143 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterNamenodeMonitoring.java

@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.federation.router;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMESERVICE_ID;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ROUTER_MONITOR_NAMENODE;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.server.federation.RouterConfigBuilder;
+import org.apache.hadoop.hdfs.server.federation.RouterDFSCluster.NamenodeContext;
+import org.apache.hadoop.hdfs.server.federation.RouterDFSCluster.RouterContext;
+import org.apache.hadoop.hdfs.server.federation.StateStoreDFSCluster;
+import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext;
+import org.apache.hadoop.hdfs.server.federation.resolver.MembershipNamenodeResolver;
+import org.apache.hadoop.util.Time;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test namenodes monitor behavior in the Router.
+ */
+public class TestRouterNamenodeMonitoring {
+
+  private static StateStoreDFSCluster cluster;
+  private static RouterContext routerContext;
+  private static MembershipNamenodeResolver resolver;
+
+  private String ns0;
+  private String ns1;
+  private long initializedTime;
+
+  @Before
+  public void setUp() throws Exception {
+    // Build and start a federated cluster with HA enabled
+    cluster = new StateStoreDFSCluster(true, 2);
+    // Enable heartbeat service and local heartbeat
+    Configuration routerConf = new RouterConfigBuilder()
+        .stateStore()
+        .admin()
+        .rpc()
+        .enableLocalHeartbeat(true)
+        .heartbeat()
+        .build();
+
+    // Specify local node (ns0.nn1) to monitor
+    StringBuilder sb = new StringBuilder();
+    ns0 = cluster.getNameservices().get(0);
+    NamenodeContext context = cluster.getNamenodes(ns0).get(1);
+    routerConf.set(DFS_NAMESERVICE_ID, ns0);
+    routerConf.set(DFS_HA_NAMENODE_ID_KEY, context.getNamenodeId());
+
+    // Specify namenodes (ns1.nn0,ns1.nn1) to monitor
+    sb = new StringBuilder();
+    ns1 = cluster.getNameservices().get(1);
+    for (NamenodeContext ctx : cluster.getNamenodes(ns1)) {
+      String suffix = ctx.getConfSuffix();
+      if (sb.length() != 0) {
+        sb.append(",");
+      }
+      sb.append(suffix);
+    }
+    // override with the namenodes: ns1.nn0,ns1.nn1
+    routerConf.set(DFS_ROUTER_MONITOR_NAMENODE, sb.toString());
+
+    cluster.addRouterOverrides(routerConf);
+    cluster.startCluster();
+    cluster.startRouters();
+    cluster.waitClusterUp();
+
+    routerContext = cluster.getRandomRouter();
+    resolver = (MembershipNamenodeResolver) routerContext.getRouter()
+        .getNamenodeResolver();
+    initializedTime = Time.now();
+  }
+
+  @After
+  public void tearDown() {
+    if (cluster != null) {
+      cluster.stopRouter(routerContext);
+      cluster.shutdown();
+      cluster = null;
+    }
+  }
+
+  @Test
+  public void testNamenodeMonitoring() throws Exception {
+    // Set nn0 to active for all nameservices
+    for (String ns : cluster.getNameservices()) {
+      cluster.switchToActive(ns, "nn0");
+      cluster.switchToStandby(ns, "nn1");
+    }
+
+    Collection<NamenodeHeartbeatService> heartbeatServices = routerContext
+        .getRouter().getNamenodeHearbeatServices();
+    // manually trigger the heartbeat
+    for (NamenodeHeartbeatService service : heartbeatServices) {
+      service.periodicInvoke();
+    }
+
+    resolver.loadCache(true);
+    List<? extends FederationNamenodeContext> namespaceInfo0 =
+        resolver.getNamenodesForNameserviceId(ns0);
+    List<? extends FederationNamenodeContext> namespaceInfo1 =
+        resolver.getNamenodesForNameserviceId(ns1);
+
+    // The modified date won't be updated in ns0.nn0 since it isn't
+    // monitored by the Router.
+    assertEquals("nn0", namespaceInfo0.get(1).getNamenodeId());
+    assertTrue(namespaceInfo0.get(1).getDateModified() < initializedTime);
+
+    // other namenodes should be updated as expected
+    assertEquals("nn1", namespaceInfo0.get(0).getNamenodeId());
+    assertTrue(namespaceInfo0.get(0).getDateModified() > initializedTime);
+
+    assertEquals("nn0", namespaceInfo1.get(0).getNamenodeId());
+    assertTrue(namespaceInfo1.get(0).getDateModified() > initializedTime);
+
+    assertEquals("nn1", namespaceInfo1.get(1).getNamenodeId());
+    assertTrue(namespaceInfo1.get(1).getDateModified() > initializedTime);
+  }
+}

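Outside the test harness, the monitoring set exercised here boils down to three settings (a sketch using the constants imported at the top of this test; the nameservice and namenode ids mirror the test's naming):

    Configuration routerConf = new Configuration();
    // the local namenode this Router heartbeats directly (ns0.nn1 in the test)
    routerConf.set(DFS_NAMESERVICE_ID, "ns0");
    routerConf.set(DFS_HA_NAMENODE_ID_KEY, "nn1");
    // remote namenodes to monitor explicitly (ns1.nn0 and ns1.nn1)
    routerConf.set(DFS_ROUTER_MONITOR_NAMENODE, "ns1.nn0,ns1.nn1");
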
+ 110 - 17
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestDirectoryDiffList.java → hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestDiffListBySkipList.java

@@ -27,6 +27,7 @@ import org.apache.hadoop.hdfs.server.namenode.INode;
 import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
 import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature.ChildrenDiff;
 import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature.DirectoryDiff;
+import org.apache.hadoop.hdfs.server.namenode.snapshot.DiffListBySkipList.SkipListNode;
 import org.apache.hadoop.hdfs.util.ReadOnlyList;
 import org.junit.After;
 import org.junit.Assert;
@@ -36,13 +37,16 @@ import org.junit.Test;
 import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.ThreadLocalRandom;
-import java.util.function.IntFunction;
+import java.util.function.ToIntBiFunction;
+import java.util.function.ToIntFunction;
 
 /**
  * This class tests the DirectoryDiffList API's.
  */
-public class TestDirectoryDiffList{
+public class TestDiffListBySkipList {
   static final int NUM_SNAPSHOTS = 100;
+  static final int MAX_LEVEL = 5;
+
   static {
     SnapshotTestHelper.disableLogs();
   }
@@ -71,6 +75,11 @@ public class TestDirectoryDiffList{
     }
   }
 
+  static DiffListBySkipList newDiffListBySkipList() {
+    DirectoryDiffListFactory.init(3, MAX_LEVEL, DiffListBySkipList.LOG);
+    return new DiffListBySkipList(0);
+  }
+
   static void assertList(List<INode> expected, List<INode> computed) {
     Assert.assertEquals(expected.size(), computed.size());
     for (int index = 0; index < expected.size(); index++) {
@@ -78,7 +87,7 @@ public class TestDirectoryDiffList{
     }
   }
 
-  static void verifyChildrenList(DirectoryDiffList skip, INodeDirectory dir) {
+  static void verifyChildrenList(DiffListBySkipList skip, INodeDirectory dir) {
     final int n = skip.size();
     for (int i = 0; i < skip.size(); i++) {
       final List<INode> expected = ReadOnlyList.Util.asList(
@@ -95,7 +104,7 @@ public class TestDirectoryDiffList{
   }
 
   static void verifyChildrenList(
-      DiffList<DirectoryDiff> array, DirectoryDiffList skip,
+      DiffList<DirectoryDiff> array, DiffListBySkipList skip,
       INodeDirectory dir, List<INode> childrenList) {
     final int n = array.size();
     Assert.assertEquals(n, skip.size());
@@ -114,6 +123,8 @@ public class TestDirectoryDiffList{
         }
       }
     }
+
+    assertSkipList(skip);
   }
 
   private static ChildrenDiff getCombined(
@@ -144,13 +155,13 @@ public class TestDirectoryDiffList{
 
   static void testAddLast(int n) throws Exception {
     final Path root = new Path("/testAddLast" + n);
-    DirectoryDiffList.LOG.info("run " + root);
+    DiffListBySkipList.LOG.info("run " + root);
 
-    final DirectoryDiffList skipList = new DirectoryDiffList(0, 3, 5);
+    final DiffListBySkipList skipList = newDiffListBySkipList();
     final DiffList<DirectoryDiff> arrayList = new DiffListByArrayList<>(0);
     INodeDirectory dir = addDiff(n, skipList, arrayList, root);
     // verify that both the children lists obtained from hdfs and
-    // DirectoryDiffList are same
+    // DiffListBySkipList are the same
     verifyChildrenList(skipList, dir);
     verifyChildrenList(arrayList, skipList, dir, Collections.emptyList());
   }
@@ -163,7 +174,7 @@ public class TestDirectoryDiffList{
 
   static void testAddFirst(int n) throws Exception {
     final Path root = new Path("/testAddFirst" + n);
-    DirectoryDiffList.LOG.info("run " + root);
+    DiffListBySkipList.LOG.info("run " + root);
 
     hdfs.mkdirs(root);
     for (int i = 1; i < n; i++) {
@@ -180,7 +191,7 @@ public class TestDirectoryDiffList{
     DiffList<DirectoryDiff> diffs = dir.getDiffs().asList();
     List<INode> childrenList = ReadOnlyList.Util.asList(dir.getChildrenList(
         diffs.get(0).getSnapshotId()));
-    final DirectoryDiffList skipList = new DirectoryDiffList(0, 3, 5);
+    final DiffListBySkipList skipList = newDiffListBySkipList();
     final DiffList<DirectoryDiff> arrayList = new DiffListByArrayList<>(0);
     for (int i = diffs.size() - 1; i >= 0; i--) {
       final DirectoryDiff d = diffs.get(i);
@@ -188,7 +199,7 @@ public class TestDirectoryDiffList{
       arrayList.addFirst(d);
     }
     // verify that both the children lists obtained from hdfs and
-    // DirectoryDiffList are same
+    // DiffListBySkipList are the same
     verifyChildrenList(skipList, dir);
     verifyChildrenList(arrayList, skipList, dir, childrenList);
   }
@@ -208,6 +219,7 @@ public class TestDirectoryDiffList{
       skipList.addLast(d);
       arrayList.addLast(d);
     }
+    DiffListBySkipList.LOG.info("skipList: " + skipList);
     return dir;
   }
 
@@ -228,19 +240,26 @@ public class TestDirectoryDiffList{
     testRemove("Random", n, i -> ThreadLocalRandom.current().nextInt(n - i));
   }
 
-  static void testRemove(String name, int n, IntFunction<Integer> indexFunction)
+  static void testRemove(String name, int n,
+      ToIntFunction<Integer> indexFunction) throws Exception {
+    testRemove(name, n, (skipList, i) -> indexFunction.applyAsInt(i));
+  }
+
+  static void testRemove(String name, int n,
+      ToIntBiFunction<DiffListBySkipList, Integer> indexFunction)
       throws Exception {
     final Path root = new Path("/testRemove" + name + n);
-    DirectoryDiffList.LOG.info("run " + root);
+    DiffListBySkipList.LOG.info("run " + root);
 
-    final DirectoryDiffList skipList = new DirectoryDiffList(0, 3, 5);
+    final DiffListBySkipList skipList = newDiffListBySkipList();
     final DiffList<DirectoryDiff> arrayList = new DiffListByArrayList<>(0);
     final INodeDirectory dir = addDiff(n, skipList, arrayList, root);
     Assert.assertEquals(n, arrayList.size());
     Assert.assertEquals(n, skipList.size());
 
-    for(int i = 0; i < n; i++) {
-      final int index = indexFunction.apply(i);
+    for (int i = 0; i < n; i++) {
+      DiffListBySkipList.LOG.debug("i={}: {}", i, skipList);
+      final int index = indexFunction.applyAsInt(skipList, i);
       final DirectoryDiff diff = remove(index, skipList, arrayList);
       hdfs.deleteSnapshot(root, "s" + diff.getSnapshotId());
       verifyChildrenList(skipList, dir);
@@ -248,10 +267,58 @@ public class TestDirectoryDiffList{
     }
   }
 
-  static DirectoryDiff remove(int i, DirectoryDiffList skip,
+  @Test
+  public void testRemoveFromLowerLevel() throws Exception {
+    testRemove("FromLowerLevel", NUM_SNAPSHOTS,
+        new ToIntBiFunction<DiffListBySkipList, Integer>() {
+          private int level = 0;
+
+          @Override
+          public int applyAsInt(DiffListBySkipList skipList, Integer integer) {
+            for (; level <= MAX_LEVEL; level++) {
+              final int index = findIndex(skipList, level);
+              if (index != -1) {
+                return index;
+              }
+            }
+            return -1;
+          }
+        });
+  }
+
+  @Test
+  public void testRemoveFromUpperLevel() throws Exception {
+    testRemove("FromUpperLevel", NUM_SNAPSHOTS,
+        new ToIntBiFunction<DiffListBySkipList, Integer>() {
+      private int level = MAX_LEVEL;
+      @Override
+      public int applyAsInt(DiffListBySkipList skipList, Integer integer) {
+        for(; level >= 0; level--) {
+          final int index = findIndex(skipList, level);
+          if (index != -1) {
+            return index;
+          }
+          DiffListBySkipList.LOG.info("change from level " + level);
+        }
+        return -1;
+      }
+    });
+  }
+
+  static int findIndex(DiffListBySkipList skipList, int level) {
+    for (int i = 0; i < skipList.size(); i++) {
+      if (skipList.getSkipListNode(i).level() == level) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  static DirectoryDiff remove(int i, DiffListBySkipList skip,
       DiffList<DirectoryDiff> array) {
-    DirectoryDiffList.LOG.info("remove " + i);
     final DirectoryDiff expected = array.remove(i);
+    DiffListBySkipList.LOG
+        .info("remove " + i + ", snapshotId=" + expected.getSnapshotId());
     final DirectoryDiff computed = skip.remove(i);
     assertDirectoryDiff(expected, computed);
     return expected;
@@ -261,4 +328,30 @@ public class TestDirectoryDiffList{
       DirectoryDiff computed) {
     Assert.assertEquals(expected.getSnapshotId(), computed.getSnapshotId());
   }
+
+  static void assertSkipList(DiffListBySkipList skipList) {
+    for(int i = 0; i < skipList.size(); i++) {
+      assertSkipListNode(skipList.getSkipListNode(i));
+    }
+  }
+
+  static void assertSkipListNode(SkipListNode n) {
+    for (int i = 1; i <= n.level(); i++) {
+      final SkipListNode target = n.getSkipNode(i);
+      final ChildrenDiff diff = n.getChildrenDiff(i);
+      if (target == null) {
+        if (diff != null) {
+          throw new AssertionError(
+              "Target is null but children diff is not at i=" + i + n
+                  .appendTo(new StringBuilder(": ")));
+        }
+      } else if (target == n.getSkipNode(i - 1)) {
+        if (diff != n.getChildrenDiff(i - 1)) {
+          throw new AssertionError(
+              "Same target but different children diff at i=" + i + n
+                  .appendTo(new StringBuilder(": ")));
+        }
+      }
+    }
+  }
 }

+ 57 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsUrl.java

@@ -20,6 +20,7 @@ package org.apache.hadoop.hdfs.web;
 
 import static org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod.KERBEROS;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.mockito.Mockito.mock;
 
 import java.io.IOException;
@@ -29,9 +30,15 @@ import java.net.URL;
 import java.util.Arrays;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.WebHdfs;
 import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
@@ -357,4 +364,54 @@ public class TestWebHdfsUrl {
     }
     return (WebHdfsFileSystem) FileSystem.get(uri, conf);
   }
+
+  private static final String SPECIAL_CHARACTER_FILENAME =
+          "specialFile ?\"\\()[]_-=&+;,{}#%'`~!@$^*|<>.";
+
+  @Test
+  public void testWebHdfsSpecialCharacterFile() throws Exception {
+    UserGroupInformation ugi =
+            UserGroupInformation.createRemoteUser("test-user");
+    ugi.setAuthenticationMethod(KERBEROS);
+    UserGroupInformation.setLoginUser(ugi);
+
+    final Configuration conf = WebHdfsTestUtil.createConf();
+    final Path dir = new Path("/testWebHdfsSpecialCharacterFile");
+
+    final short numDatanodes = 1;
+    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+            .numDataNodes(numDatanodes)
+            .build();
+    try {
+      cluster.waitActive();
+      final FileSystem fs = WebHdfsTestUtil
+              .getWebHdfsFileSystem(conf, WebHdfs.SCHEME);
+
+      //create a file
+      final long length = 1L << 10;
+      final Path file1 = new Path(dir, SPECIAL_CHARACTER_FILENAME);
+
+      DFSTestUtil.createFile(fs, file1, length, numDatanodes, 20120406L);
+
+      //get file status and check that it was written properly.
+      final FileStatus s1 = fs.getFileStatus(file1);
+      assertEquals("Write failed for file " + file1, length, s1.getLen());
+
+      boolean found = false;
+      RemoteIterator<LocatedFileStatus> statusRemoteIterator =
+              fs.listFiles(dir, false);
+      while (statusRemoteIterator.hasNext()) {
+        LocatedFileStatus locatedFileStatus = statusRemoteIterator.next();
+        if (locatedFileStatus.isFile() &&
+                SPECIAL_CHARACTER_FILENAME
+                        .equals(locatedFileStatus.getPath().getName())) {
+          found = true;
+        }
+      }
+      assertFalse("Could not find file with special character", !found);
+    } finally {
+      cluster.shutdown();
+    }
+  }
+
 }

+ 1 - 1
hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/pi/package.html

@@ -66,7 +66,7 @@ The table on the right are the results computed by distbbp.
     <li>The log files are available
         <a href="https://issues.apache.org/jira/secure/attachment/12408543/1e15log.zip">here</a>.</li>
     <li>The result was posted in
-        <a href="http://yahoohadoop.tumblr.com/post/98338598026/hadoop-computes-the-10-15-1st-bit-of-π">this YDN blog</a>.</li>
+        <a href="http://yahoohadoop.tumblr.com/post/98338598026/hadoop-computes-the-10-15-1st-bit-of-%CF%80">this YDN blog</a>.</li>
 
 </ul></li>
 <li>The second part of Row 15 (<tt>D3611</tt>)

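The link text is unchanged; only the literal π in the URL is replaced by its percent-encoded UTF-8 form. For reference, the encoding can be reproduced with a standard JDK call (illustrative only):

    // U+03C0 (π) is 0xCF 0x80 in UTF-8, hence "%CF%80" in the URL.
    // (URLEncoder.encode declares UnsupportedEncodingException; UTF-8 is always available.)
    String encodedPi = java.net.URLEncoder.encode("\u03C0", "UTF-8"); // "%CF%80"
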
+ 61 - 11
hadoop-project/pom.xml

@@ -49,9 +49,6 @@
     <xerces.jdiff.version>2.11.0</xerces.jdiff.version>
 
     <kafka.version>0.8.2.1</kafka.version>
-    <hbase.version>1.2.6</hbase.version>
-    <hbase-compatible-hadoop.version>2.5.1</hbase-compatible-hadoop.version>
-    <hbase-compatible-guava.version>11.0.2</hbase-compatible-guava.version>
 
     <hadoop.assemblies.version>${project.version}</hadoop.assemblies.version>
     <commons-daemon.version>1.0.13</commons-daemon.version>
@@ -148,6 +145,8 @@
 
     <swagger-annotations-version>1.5.4</swagger-annotations-version>
     <snakeyaml.version>1.16</snakeyaml.version>
+    <hbase.one.version>1.2.6</hbase.one.version>
+    <hbase.two.version>2.0.0-beta-1</hbase.two.version>
   </properties>
 
   <dependencyManagement>
@@ -409,12 +408,6 @@
         <version>${project.version}</version>
       </dependency>
 
-      <dependency>
-        <groupId>org.apache.hadoop</groupId>
-        <artifactId>hadoop-yarn-server-timelineservice-hbase-server</artifactId>
-        <version>${project.version}</version>
-      </dependency>
-
      <dependency>
         <groupId>org.apache.hadoop</groupId>
         <artifactId>hadoop-yarn-applications-distributedshell</artifactId>
@@ -674,7 +667,6 @@
         <artifactId>jsp-api</artifactId>
         <version>2.1</version>
       </dependency>
-
       <dependency>
         <groupId>org.glassfish</groupId>
         <artifactId>javax.servlet</artifactId>
@@ -829,7 +821,7 @@
       <dependency>
         <groupId>commons-io</groupId>
         <artifactId>commons-io</artifactId>
-        <version>2.4</version>
+        <version>2.5</version>
       </dependency>
 
       <dependency>
@@ -1416,6 +1408,11 @@
           <version>3.8.0</version>
           <scope>test</scope>
         </dependency>
+      <dependency>
+        <groupId>org.jruby.jcodings</groupId>
+        <artifactId>jcodings</artifactId>
+        <version>1.0.13</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>
 
@@ -1884,6 +1881,59 @@
         </plugins>
       </build>
     </profile>
+    <!-- The profile for building against HBase 1.2.x
+     This is the default.
+     -->
+    <profile>
+      <id>hbase1</id>
+      <activation>
+        <property>
+          <name>!hbase.profile</name>
+        </property>
+      </activation>
+      <properties>
+        <hbase.version>${hbase.one.version}</hbase.version>
+        <hbase-compatible-hadoop.version>2.5.1</hbase-compatible-hadoop.version>
+        <hbase-compatible-guava.version>12.0.1</hbase-compatible-guava.version>
+        <hbase-server-artifactid>hadoop-yarn-server-timelineservice-hbase-server-1</hbase-server-artifactid>
+      </properties>
+      <dependencyManagement>
+        <dependencies>
+          <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>${hbase-server-artifactid}</artifactId>
+            <version>${project.version}</version>
+          </dependency>
+        </dependencies>
+      </dependencyManagement>
+    </profile>
+    <!-- The profile for building against HBase 2.0.0.
+     Activate using: mvn -Dhbase.profile=2.0
+    -->
+    <profile>
+      <id>hbase2</id>
+      <activation>
+        <property>
+          <name>hbase.profile</name>
+          <value>2.0</value>
+        </property>
+      </activation>
+      <properties>
+        <hbase.version>${hbase.two.version}</hbase.version>
+        <hbase-compatible-hadoop.version>3.0.0</hbase-compatible-hadoop.version>
+        <hbase-compatible-guava.version>11.0.2</hbase-compatible-guava.version>
+        <hbase-server-artifactid>hadoop-yarn-server-timelineservice-hbase-server-2</hbase-server-artifactid>
+      </properties>
+      <dependencyManagement>
+        <dependencies>
+          <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>${hbase-server-artifactid}</artifactId>
+            <version>${project.version}</version>
+          </dependency>
+        </dependencies>
+      </dependencyManagement>
+    </profile>
   </profiles>
 
   <repositories>

+ 2 - 0
hadoop-tools/hadoop-archive-logs/src/test/java/org/apache/hadoop/tools/TestHadoopArchiveLogs.java

@@ -61,6 +61,8 @@ public class TestHadoopArchiveLogs {
     Path rootLogDir = new Path("target", "logs");
     String suffix = "logs";
     Path logDir = new Path(rootLogDir, new Path(USER, suffix));
+    fs.delete(logDir, true);
+    Assert.assertFalse(fs.exists(logDir));
     fs.mkdirs(logDir);
 
     // no files found

+ 36 - 0
hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml

@@ -28,4 +28,40 @@
     <Method name="s3Exists" />
     <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE" />
   </Match>
+
+  <!--
+    This extends the serializable S3Object, so findbug checks
+    serializability. It is never serialized however, so its
+    warnings are false positives.
+  -->
+  <Match>
+    <Class name="org.apache.hadoop.fs.s3a.InconsistentS3Object" />
+    <Bug pattern="SE_TRANSIENT_FIELD_NOT_RESTORED" />
+  </Match>
+  <Match>
+    <Class name="org.apache.hadoop.fs.s3a.InconsistentS3Object" />
+    <Bug pattern="SE_NO_SERIALVERSIONID" />
+  </Match>
+
+  <!--
+   findbugs gets confused by lambda expressions in synchronized methods
+   and considers references to fields to be unsynchronized.
+   As you can't disable the methods individually, we have to disable
+   them for the entire class.
+    -->
+  <Match>
+    <Class name="org.apache.hadoop.fs.s3a.S3AInputStream"/>
+    <Bug pattern="IS2_INCONSISTENT_SYNC"/>
+  </Match>
+  <!--
+    findbugs reporting RV ignored. Not true.
+    "Return value of S3AReadOpContext.getReadInvoker() ignored,
+    but method has no side effect"
+  -->
+  <Match>
+    <Class name="org.apache.hadoop.fs.s3a.S3AInputStream"/>
+    <Method name="reopen"/>
+    <Bug pattern="RV_RETURN_VALUE_IGNORED_NO_SIDE_EFFECT"/>
+  </Match>
+
 </FindBugsFilter>

+ 163 - 0
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/FailureInjectionPolicy.java

@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+
+import static org.apache.hadoop.fs.s3a.Constants.*;
+
+/**
+ * Simple object which stores current failure injection settings.
+ * "Delaying a key" can mean:
+ *    - Removing it from the S3 client's listings while delay is in effect.
+ *    - Causing input stream reads to fail.
+ *    - Causing the S3 side of getFileStatus(), i.e.
+ *      AmazonS3#getObjectMetadata(), to throw FileNotFound.
+ */
+public class FailureInjectionPolicy {
+  /**
+   * Keys containing this substring will be subject to delayed visibility.
+   */
+  public static final String DEFAULT_DELAY_KEY_SUBSTRING = "DELAY_LISTING_ME";
+
+  /**
+   * How many seconds affected keys will have delayed visibility.
+   * This should probably be a config value.
+   */
+  public static final long DEFAULT_DELAY_KEY_MSEC = 5 * 1000;
+
+  public static final float DEFAULT_DELAY_KEY_PROBABILITY = 1.0f;
+
+  /** Special config value since we can't store empty strings in XML. */
+  public static final String MATCH_ALL_KEYS = "*";
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(InconsistentAmazonS3Client.class);
+
+  /** Empty string matches all keys. */
+  private String delayKeySubstring;
+
+  /** Probability to delay visibility of a matching key. */
+  private float delayKeyProbability;
+
+  /** Time in milliseconds to delay visibility of newly modified object. */
+  private long delayKeyMsec;
+
+  /**
+   * Probability of throttling a request.
+   */
+  private float throttleProbability;
+
+  /**
+   * limit for failures before operations succeed; if 0 then "no limit".
+   */
+  private int failureLimit = 0;
+
+  public FailureInjectionPolicy(Configuration conf) {
+
+    this.delayKeySubstring = conf.get(FAIL_INJECT_INCONSISTENCY_KEY,
+        DEFAULT_DELAY_KEY_SUBSTRING);
+    // "" is a substring of all strings, use it to match all keys.
+    if (this.delayKeySubstring.equals(MATCH_ALL_KEYS)) {
+      this.delayKeySubstring = "";
+    }
+    this.delayKeyProbability = validProbability(
+        conf.getFloat(FAIL_INJECT_INCONSISTENCY_PROBABILITY,
+            DEFAULT_DELAY_KEY_PROBABILITY));
+    this.delayKeyMsec = conf.getLong(FAIL_INJECT_INCONSISTENCY_MSEC,
+        DEFAULT_DELAY_KEY_MSEC);
+    this.setThrottleProbability(conf.getFloat(FAIL_INJECT_THROTTLE_PROBABILITY,
+        0.0f));
+  }
+
+  public String getDelayKeySubstring() {
+    return delayKeySubstring;
+  }
+
+  public float getDelayKeyProbability() {
+    return delayKeyProbability;
+  }
+
+  public long getDelayKeyMsec() {
+    return delayKeyMsec;
+  }
+
+  public float getThrottleProbability() {
+    return throttleProbability;
+  }
+
+  public int getFailureLimit() {
+    return failureLimit;
+  }
+
+  public void setFailureLimit(int failureLimit) {
+    this.failureLimit = failureLimit;
+  }
+
+  /**
+   * Set the probability of throttling a request.
+   * @param throttleProbability the probability of a request being throttled.
+   */
+  public void setThrottleProbability(float throttleProbability) {
+    this.throttleProbability = validProbability(throttleProbability);
+  }
+
+  public static boolean trueWithProbability(float p) {
+    return Math.random() < p;
+  }
+
+  /**
+   * Should we delay listing visibility for this key?
+   * @param key key which is being put
+   * @return true if we should delay
+   */
+  public boolean shouldDelay(String key) {
+    float p = getDelayKeyProbability();
+    boolean delay = key.contains(getDelayKeySubstring());
+    delay = delay && trueWithProbability(p);
+    LOG.debug("{}, p={} -> {}", key, p, delay);
+    return delay;
+  }
+
+  @Override
+  public String toString() {
+    return String.format("FailureInjectionPolicy:" +
+            " %s msec delay, substring %s, delay probability %s;" +
+            " throttle probability %s" + "; failure limit %d",
+        delayKeyMsec, delayKeySubstring, delayKeyProbability,
+        throttleProbability, failureLimit);
+  }
+
+  /**
+   * Validate a probability option.
+   * @param p probability
+   * @return the probability, if valid
+   * @throws IllegalArgumentException if the probability is out of range.
+   */
+  private static float validProbability(float p) {
+    Preconditions.checkArgument(p >= 0.0f && p <= 1.0f,
+        "Probability out of range 0 to 1 %s", p);
+    return p;
+  }
+
+}

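A minimal sketch of driving the policy purely from configuration; the FAIL_INJECT_* keys are the Constants referenced by the constructor above, and the probability value is only illustrative.

    Configuration conf = new Configuration();
    // "*" maps to the empty substring, i.e. every key becomes a candidate
    conf.set(Constants.FAIL_INJECT_INCONSISTENCY_KEY,
        FailureInjectionPolicy.MATCH_ALL_KEYS);
    conf.setFloat(Constants.FAIL_INJECT_THROTTLE_PROBABILITY, 0.5f);
    FailureInjectionPolicy policy = new FailureInjectionPolicy(conf);
    boolean delayed = policy.shouldDelay("dir/DELAY_LISTING_ME/part-00000");
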
+ 67 - 110
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InconsistentAmazonS3Client.java

@@ -38,6 +38,7 @@ import com.amazonaws.services.s3.model.CompleteMultipartUploadResult;
 import com.amazonaws.services.s3.model.DeleteObjectRequest;
 import com.amazonaws.services.s3.model.DeleteObjectsRequest;
 import com.amazonaws.services.s3.model.DeleteObjectsResult;
+import com.amazonaws.services.s3.model.GetObjectRequest;
 import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
 import com.amazonaws.services.s3.model.InitiateMultipartUploadResult;
 import com.amazonaws.services.s3.model.ListMultipartUploadsRequest;
@@ -48,6 +49,7 @@ import com.amazonaws.services.s3.model.MultipartUploadListing;
 import com.amazonaws.services.s3.model.ObjectListing;
 import com.amazonaws.services.s3.model.PutObjectRequest;
 import com.amazonaws.services.s3.model.PutObjectResult;
+import com.amazonaws.services.s3.model.S3Object;
 import com.amazonaws.services.s3.model.S3ObjectSummary;
 import com.amazonaws.services.s3.model.UploadPartRequest;
 import com.amazonaws.services.s3.model.UploadPartResult;
@@ -60,8 +62,6 @@ import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 
-import static org.apache.hadoop.fs.s3a.Constants.*;
-
 /**
  * A wrapper around {@link com.amazonaws.services.s3.AmazonS3} that injects
  * inconsistency and/or errors.  Used for testing S3Guard.
@@ -71,49 +71,16 @@ import static org.apache.hadoop.fs.s3a.Constants.*;
 @InterfaceStability.Unstable
 public class InconsistentAmazonS3Client extends AmazonS3Client {
 
-  /**
-   * Keys containing this substring will be subject to delayed visibility.
-   */
-  public static final String DEFAULT_DELAY_KEY_SUBSTRING = "DELAY_LISTING_ME";
-
-  /**
-   * How many seconds affected keys will be delayed from appearing in listing.
-   * This should probably be a config value.
-   */
-  public static final long DEFAULT_DELAY_KEY_MSEC = 5 * 1000;
-
-  public static final float DEFAULT_DELAY_KEY_PROBABILITY = 1.0f;
-
-  /** Special config value since we can't store empty strings in XML. */
-  public static final String MATCH_ALL_KEYS = "*";
-
   private static final Logger LOG =
       LoggerFactory.getLogger(InconsistentAmazonS3Client.class);
 
-  /** Empty string matches all keys. */
-  private String delayKeySubstring;
-
-  /** Probability to delay visibility of a matching key. */
-  private float delayKeyProbability;
-
-  /** Time in milliseconds to delay visibility of newly modified object. */
-  private long delayKeyMsec;
-
-  /**
-   * Probability of throttling a request.
-   */
-  private float throttleProbability;
+  private FailureInjectionPolicy policy;
 
   /**
    * Counter of failures since last reset.
    */
   private final AtomicLong failureCounter = new AtomicLong(0);
 
-  /**
-   * limit for failures before operations succeed; if 0 then "no limit".
-   */
-  private int failureLimit = 0;
-
   /**
    * Composite of data we need to track about recently deleted objects:
    * when it was deleted (same was with recently put objects) and the object
@@ -150,36 +117,42 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
   public InconsistentAmazonS3Client(AWSCredentialsProvider credentials,
       ClientConfiguration clientConfiguration, Configuration conf) {
     super(credentials, clientConfiguration);
-    setupConfig(conf);
+    policy = new FailureInjectionPolicy(conf);
   }
 
-  protected void setupConfig(Configuration conf) {
 
-    delayKeySubstring = conf.get(FAIL_INJECT_INCONSISTENCY_KEY,
-        DEFAULT_DELAY_KEY_SUBSTRING);
-    // "" is a substring of all strings, use it to match all keys.
-    if (delayKeySubstring.equals(MATCH_ALL_KEYS)) {
-      delayKeySubstring = "";
-    }
-    delayKeyProbability = validProbability(
-        conf.getFloat(FAIL_INJECT_INCONSISTENCY_PROBABILITY,
-            DEFAULT_DELAY_KEY_PROBABILITY));
-    delayKeyMsec = conf.getLong(FAIL_INJECT_INCONSISTENCY_MSEC,
-        DEFAULT_DELAY_KEY_MSEC);
-    setThrottleProbability(conf.getFloat(FAIL_INJECT_THROTTLE_PROBABILITY,
-        0.0f));
-    LOG.info("{}", this);
+  /**
+   * Clear any accumulated inconsistency state. Used by tests to make paths
+   * visible again.
+   * @param fs S3AFileSystem under test
+   * @throws Exception on failure
+   */
+  public static void clearInconsistency(S3AFileSystem fs) throws Exception {
+    AmazonS3 s3 = fs.getAmazonS3ClientForTesting("s3guard");
+    InconsistentAmazonS3Client ic = InconsistentAmazonS3Client.castFrom(s3);
+    ic.clearInconsistency();
+  }
+
+  /**
+   * A way for tests to patch in a different fault injection policy at runtime.
+   * @param fs filesystem under test
+   * @param policy the failure injection settings to use from now on
+   * @throws Exception on failure
+   */
+  public static void setFailureInjectionPolicy(S3AFileSystem fs,
+      FailureInjectionPolicy policy) throws Exception {
+    AmazonS3 s3 = fs.getAmazonS3ClientForTesting("s3guard");
+    InconsistentAmazonS3Client ic = InconsistentAmazonS3Client.castFrom(s3);
+    ic.replacePolicy(policy);
+  }
+
+  private void replacePolicy(FailureInjectionPolicy pol) {
+    this.policy = pol;
   }
 
   @Override
   public String toString() {
-    return String.format(
-        "Inconsistent S3 Client with"
-            + " %s msec delay, substring %s, delay probability %s;"
-            + " throttle probability %s"
-            + "; failure limit %d, failure count %d",
-        delayKeyMsec, delayKeySubstring, delayKeyProbability,
-        throttleProbability, failureLimit, failureCounter.get());
+    return String.format("Inconsistent S3 Client: %s; failure count %d",
+        policy, failureCounter.get());
   }
 
   /**
@@ -470,7 +443,7 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
       return false;
     }
     long currentTime = System.currentTimeMillis();
-    long deadline = enqueueTime + delayKeyMsec;
+    long deadline = enqueueTime + policy.getDelayKeyMsec();
     if (currentTime >= deadline) {
       delayedDeletes.remove(key);
       LOG.debug("no longer delaying {}", key);
@@ -482,7 +455,7 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
   }
 
   private void registerDeleteObject(String key, String bucket) {
-    if (shouldDelay(key)) {
+    if (policy.shouldDelay(key)) {
       // Record summary so we can add it back for some time post-deletion
       ListObjectsRequest request = new ListObjectsRequest()
               .withBucketName(bucket)
@@ -498,28 +471,11 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
 
   private void registerPutObject(PutObjectRequest req) {
     String key = req.getKey();
-    if (shouldDelay(key)) {
+    if (policy.shouldDelay(key)) {
       enqueueDelayedPut(key);
     }
   }
 
-  /**
-   * Should we delay listing visibility for this key?
-   * @param key key which is being put
-   * @return true if we should delay
-   */
-  private boolean shouldDelay(String key) {
-    boolean delay = key.contains(delayKeySubstring);
-    delay = delay && trueWithProbability(delayKeyProbability);
-    LOG.debug("{} -> {}", key, delay);
-    return delay;
-  }
-
-
-  private boolean trueWithProbability(float p) {
-    return Math.random() < p;
-  }
-
   /**
    * Record this key as something that should not become visible in
    * listObject replies for a while, to simulate eventual list consistency.
@@ -561,20 +517,8 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
     return super.listMultipartUploads(listMultipartUploadsRequest);
   }
 
-  public float getDelayKeyProbability() {
-    return delayKeyProbability;
-  }
-
   public long getDelayKeyMsec() {
-    return delayKeyMsec;
-  }
-
-  /**
-   * Get the probability of the request being throttled.
-   * @return a value 0 - 1.0f.
-   */
-  public float getThrottleProbability() {
-    return throttleProbability;
+    return policy.getDelayKeyMsec();
   }
 
   /**
@@ -582,36 +526,28 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
    * @param throttleProbability the probability of a request being throttled.
    */
   public void setThrottleProbability(float throttleProbability) {
-    this.throttleProbability = validProbability(throttleProbability);
-  }
-
-  /**
-   * Validate a probability option.
-   * @param p probability
-   * @return the probability, if valid
-   * @throws IllegalArgumentException if the probability is out of range.
-   */
-  private float validProbability(float p) {
-    Preconditions.checkArgument(p >= 0.0f && p <= 1.0f,
-        "Probability out of range 0 to 1 %s", p);
-    return p;
+    policy.setThrottleProbability(throttleProbability);
   }
 
   /**
    * Conditionally fail the operation.
+   * @param errorMsg description of failure
+   * @param statusCode http status code for error
    * @throws AmazonClientException if the client chooses to fail
    * the request.
    */
-  private void maybeFail() throws AmazonClientException {
+  private void maybeFail(String errorMsg, int statusCode)
+      throws AmazonClientException {
     // code structure here is to line up for more failures later
     AmazonServiceException ex = null;
-    if (trueWithProbability(throttleProbability)) {
+    if (policy.trueWithProbability(policy.getThrottleProbability())) {
       // throttle the request
-      ex = new AmazonServiceException("throttled"
+      ex = new AmazonServiceException(errorMsg
           + " count = " + (failureCounter.get() + 1), null);
-      ex.setStatusCode(503);
+      ex.setStatusCode(statusCode);
     }
 
+    int failureLimit = policy.getFailureLimit();
     if (ex != null) {
       long count = failureCounter.incrementAndGet();
       if (failureLimit == 0
@@ -621,16 +557,37 @@ public class InconsistentAmazonS3Client extends AmazonS3Client {
     }
   }
 
+  private void maybeFail() {
+    maybeFail("throttled", 503);
+  }
+
   /**
    * Set the limit on failures before all operations pass through.
    * This resets the failure count.
    * @param limit limit; "0" means "no limit"
    */
   public void setFailureLimit(int limit) {
-    this.failureLimit = limit;
+    policy.setFailureLimit(limit);
     failureCounter.set(0);
   }
 
+  @Override
+  public S3Object getObject(GetObjectRequest var1) throws SdkClientException,
+      AmazonServiceException {
+    maybeFail("file not found", 404);
+    S3Object o = super.getObject(var1);
+    LOG.debug("Wrapping in InconsistentS3Object for key {}", var1.getKey());
+    return new InconsistentS3Object(o, policy);
+  }
+
+  @Override
+  public S3Object getObject(String bucketName, String key)
+      throws SdkClientException, AmazonServiceException {
+    S3Object o = super.getObject(bucketName, key);
+    LOG.debug("Wrapping in InconsistentS3Object for key {}", key);
+    return new InconsistentS3Object(o, policy);
+  }
+
   /** Since ObjectListing is immutable, we just override it with wrapper. */
   @SuppressWarnings("serial")
   private static class CustomObjectListing extends ObjectListing {

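A sketch of how an S3Guard test can use the two new static helpers; it assumes the S3AFileSystem under test was created with the inconsistent client factory, otherwise the internal cast will fail.

    // Swap in a custom policy for one test case, then clear any pending
    // inconsistency before the next one.
    FailureInjectionPolicy custom = new FailureInjectionPolicy(conf);
    InconsistentAmazonS3Client.setFailureInjectionPolicy(fs, custom);
    try {
      // ... exercise the code under test ...
    } finally {
      InconsistentAmazonS3Client.clearInconsistency(fs);
    }
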
+ 232 - 0
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InconsistentS3Object.java

@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.amazonaws.services.s3.internal.AmazonS3ExceptionBuilder;
+import com.amazonaws.services.s3.model.AmazonS3Exception;
+import com.amazonaws.services.s3.model.ObjectMetadata;
+import com.amazonaws.services.s3.model.S3Object;
+import com.amazonaws.services.s3.model.S3ObjectInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Wrapper around S3Object so we can do failure injection on
+ * getObjectContent() and S3ObjectInputStream.
+ * See also {@link InconsistentAmazonS3Client}.
+ */
+@SuppressWarnings({"NonSerializableFieldInSerializableClass", "serial"})
+public class InconsistentS3Object extends S3Object {
+
+  // This should be configurable, probably.
+  public static final int MAX_READ_FAILURES = 100;
+
+  private static int readFailureCounter = 0;
+  private transient S3Object wrapped;
+  private transient FailureInjectionPolicy policy;
+  private final static transient Logger LOG = LoggerFactory.getLogger(
+      InconsistentS3Object.class);
+
+  public InconsistentS3Object(S3Object wrapped, FailureInjectionPolicy policy) {
+    this.wrapped = wrapped;
+    this.policy = policy;
+  }
+
+  @Override
+  public S3ObjectInputStream getObjectContent() {
+    return new InconsistentS3InputStream(wrapped.getObjectContent());
+  }
+
+  @Override
+  public String toString() {
+    return "InconsistentS3Object wrapping: " + wrapped.toString();
+  }
+
+  @Override
+  public ObjectMetadata getObjectMetadata() {
+    return wrapped.getObjectMetadata();
+  }
+
+  @Override
+  public void setObjectMetadata(ObjectMetadata metadata) {
+    wrapped.setObjectMetadata(metadata);
+  }
+
+  @Override
+  public void setObjectContent(S3ObjectInputStream objectContent) {
+    wrapped.setObjectContent(objectContent);
+  }
+
+  @Override
+  public void setObjectContent(InputStream objectContent) {
+    wrapped.setObjectContent(objectContent);
+  }
+
+  @Override
+  public String getBucketName() {
+    return wrapped.getBucketName();
+  }
+
+  @Override
+  public void setBucketName(String bucketName) {
+    wrapped.setBucketName(bucketName);
+  }
+
+  @Override
+  public String getKey() {
+    return wrapped.getKey();
+  }
+
+  @Override
+  public void setKey(String key) {
+    wrapped.setKey(key);
+  }
+
+  @Override
+  public String getRedirectLocation() {
+    return wrapped.getRedirectLocation();
+  }
+
+  @Override
+  public void setRedirectLocation(String redirectLocation) {
+    wrapped.setRedirectLocation(redirectLocation);
+  }
+
+  @Override
+  public Integer getTaggingCount() {
+    return wrapped.getTaggingCount();
+  }
+
+  @Override
+  public void setTaggingCount(Integer taggingCount) {
+    wrapped.setTaggingCount(taggingCount);
+  }
+
+  @Override
+  public void close() throws IOException {
+    wrapped.close();
+  }
+
+  @Override
+  public boolean isRequesterCharged() {
+    return wrapped.isRequesterCharged();
+  }
+
+  @Override
+  public void setRequesterCharged(boolean isRequesterCharged) {
+    wrapped.setRequesterCharged(isRequesterCharged);
+  }
+
+  private AmazonS3Exception mockException(String msg, int httpResponse) {
+    AmazonS3ExceptionBuilder builder = new AmazonS3ExceptionBuilder();
+    builder.setErrorMessage(msg);
+    builder.setStatusCode(httpResponse); // this is the important part
+    builder.setErrorCode(String.valueOf(httpResponse));
+    return builder.build();
+  }
+
+  /**
+   * Insert a failure injection point for a read call.
+   * @throws IOException as the codepath is on an InputStream, not another SDK call.
+   */
+  private void readFailpoint(int off, int len) throws IOException {
+    if (shouldInjectFailure(getKey())) {
+      String error = String.format(
+          "read(b, %d, %d) on key %s failed: injecting error %d/%d" +
+              " for test.", off, len, getKey(), readFailureCounter,
+          MAX_READ_FAILURES);
+      throw new FileNotFoundException(error);
+    }
+  }
+
+  /**
+   * Insert a failure injection point for an InputStream skip() call.
+   * @throws IOException as the codepath is on an InputStream, not another SDK call.
+   */
+  private void skipFailpoint(long len) throws IOException {
+    if (shouldInjectFailure(getKey())) {
+      String error = String.format(
+          "skip(%d) on key %s failed: injecting error %d/%d for test.",
+          len, getKey(), readFailureCounter, MAX_READ_FAILURES);
+      throw new FileNotFoundException(error);
+    }
+  }
+
+  private boolean shouldInjectFailure(String key) {
+    if (policy.shouldDelay(key) &&
+        readFailureCounter < MAX_READ_FAILURES) {
+      readFailureCounter++;
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Wraps S3ObjectInputStream and implements failure injection.
+   */
+  protected class InconsistentS3InputStream extends S3ObjectInputStream {
+    private S3ObjectInputStream wrapped;
+
+    public InconsistentS3InputStream(S3ObjectInputStream wrapped) {
+      // seems awkward to have the stream wrap itself.
+      super(wrapped, wrapped.getHttpRequest());
+      this.wrapped = wrapped;
+    }
+
+    @Override
+    public void abort() {
+      wrapped.abort();
+    }
+
+    @Override
+    public int available() throws IOException {
+      return wrapped.available();
+    }
+
+    @Override
+    public void close() throws IOException {
+      wrapped.close();
+    }
+
+    @Override
+    public long skip(long n) throws IOException {
+      skipFailpoint(n);
+      return wrapped.skip(n);
+    }
+
+    @Override
+    public int read() throws IOException {
+      LOG.debug("read() for key {}", getKey());
+      readFailpoint(0, 1);
+      return wrapped.read();
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+      LOG.debug("read(b, {}, {}) for key {}", off, len, getKey());
+      readFailpoint(off, len);
+      return wrapped.read(b, off, len);
+    }
+
+  }
+}

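From the caller's point of view the wrapper behaves as below (sketch; 'client' is an InconsistentAmazonS3Client, the bucket name is hypothetical, and the key matches the default DELAY_LISTING_ME substring): reads fail with FileNotFoundException until MAX_READ_FAILURES injected failures have been consumed.

    S3Object o = client.getObject("example-bucket", "dir/DELAY_LISTING_ME/blob");
    try (S3ObjectInputStream in = o.getObjectContent()) {
      in.read();   // throws FileNotFoundException while failures are injected
    } catch (IOException expected) {
      // in real use this is retried by S3AInputStream's invoker
    }
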
+ 3 - 2
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java

@@ -310,6 +310,9 @@ public class Invoker {
     boolean shouldRetry;
     do {
       try {
+        if (retryCount > 0) {
+          LOG.debug("retry #{}", retryCount);
+        }
         // execute the operation, returning if successful
         return operation.execute();
       } catch (IOException | SdkBaseException e) {
@@ -327,8 +330,6 @@ public class Invoker {
             (SdkBaseException)caught);
       }
 
-
-      int attempts = retryCount + 1;
       try {
         // decide action base on operation, invocation count, etc
         retryAction = retryPolicy.shouldRetry(translated, retryCount, 0,

+ 35 - 9
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

@@ -166,6 +166,10 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
   // APIs on an uninitialized filesystem.
   private Invoker invoker = new Invoker(RetryPolicies.TRY_ONCE_THEN_FAIL,
       Invoker.LOG_EVENT);
+  // Only used for very specific code paths which behave differently for
+  // S3Guard. Retries FileNotFound, so be careful if you use this.
+  private Invoker s3guardInvoker = new Invoker(RetryPolicies.TRY_ONCE_THEN_FAIL,
+      Invoker.LOG_EVENT);
   private final Retried onRetry = this::operationRetried;
   private String bucket;
   private int maxKeys;
@@ -251,6 +255,8 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
       s3 = ReflectionUtils.newInstance(s3ClientFactoryClass, conf)
           .createS3Client(name);
       invoker = new Invoker(new S3ARetryPolicy(getConf()), onRetry);
+      s3guardInvoker = new Invoker(new S3GuardExistsRetryPolicy(getConf()),
+          onRetry);
       writeHelper = new WriteOperationHelper(this, getConf());
 
       maxKeys = intOption(conf, MAX_PAGING_KEYS, DEFAULT_MAX_PAGING_KEYS, 1);
@@ -697,18 +703,20 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
     }
 
     return new FSDataInputStream(
-        new S3AInputStream(new S3ObjectAttributes(
-          bucket,
-          pathToKey(f),
-          serverSideEncryptionAlgorithm,
-          getServerSideEncryptionKey(bucket, getConf())),
-            fileStatus.getLen(),
-            s3,
+        new S3AInputStream(new S3AReadOpContext(hasMetadataStore(),
+            invoker,
+            s3guardInvoker,
             statistics,
             instrumentation,
+            fileStatus),
+            new S3ObjectAttributes(bucket,
+                pathToKey(f),
+                serverSideEncryptionAlgorithm,
+                getServerSideEncryptionKey(bucket, getConf())),
+            fileStatus.getLen(),
+            s3,
             readAhead,
-            inputPolicy,
-            invoker));
+            inputPolicy));
   }
 
   /**
@@ -1564,6 +1572,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
     long len = request.getPartSize();
     incrementPutStartStatistics(len);
     try {
+      setOptionalUploadPartRequestParameters(request);
       UploadPartResult uploadPartResult = s3.uploadPart(request);
       incrementPutCompletedStatistics(true, len);
       return uploadPartResult;
@@ -2555,6 +2564,23 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
     }
   }
 
+  /**
+   * Sets server-side encryption parameters on the part upload
+   * request when encryption is enabled.
+   * @param request upload part request
+   */
+  protected void setOptionalUploadPartRequestParameters(
+      UploadPartRequest request) {
+    switch (serverSideEncryptionAlgorithm) {
+    case SSE_C:
+      if (isNotBlank(getServerSideEncryptionKey(bucket, getConf()))) {
+        request.setSSECustomerKey(generateSSECustomerKey());
+      }
+      break;
+    default:
+    }
+  }
+
   /**
    * Initiate a multipart upload from the preconfigured request.
    * Retry policy: none + untranslated.

+ 76 - 46
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java

@@ -30,7 +30,6 @@ import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.CanSetReadahead;
 import org.apache.hadoop.fs.FSExceptionMessages;
 import org.apache.hadoop.fs.FSInputStream;
-import org.apache.hadoop.fs.FileSystem;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -72,10 +71,11 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    */
   private volatile boolean closed;
   private S3ObjectInputStream wrappedStream;
-  private final FileSystem.Statistics stats;
+  private final S3AReadOpContext context;
   private final AmazonS3 client;
   private final String bucket;
   private final String key;
+  private final String pathStr;
   private final long contentLength;
   private final String uri;
   private static final Logger LOG =
@@ -85,7 +85,6 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
   private String serverSideEncryptionKey;
   private S3AInputPolicy inputPolicy;
   private long readahead = Constants.DEFAULT_READAHEAD_RANGE;
-  private final Invoker invoker;
 
   /**
    * This is the actual position within the object, used by
@@ -108,40 +107,33 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * Create the stream.
    * This does not attempt to open it; that is only done on the first
    * actual read() operation.
+   * @param ctx operation context
    * @param s3Attributes object attributes from a HEAD request
    * @param contentLength length of content
    * @param client S3 client to use
-   * @param stats statistics to update
-   * @param instrumentation instrumentation to update
    * @param readahead readahead bytes
    * @param inputPolicy IO policy
-   * @param invoker preconfigured invoker
    */
-  public S3AInputStream(S3ObjectAttributes s3Attributes,
-      long contentLength,
-      AmazonS3 client,
-      FileSystem.Statistics stats,
-      S3AInstrumentation instrumentation,
-      long readahead,
-      S3AInputPolicy inputPolicy,
-      Invoker invoker) {
+  public S3AInputStream(S3AReadOpContext ctx, S3ObjectAttributes s3Attributes,
+      long contentLength, AmazonS3 client, long readahead,
+      S3AInputPolicy inputPolicy) {
     Preconditions.checkArgument(isNotEmpty(s3Attributes.getBucket()),
         "No Bucket");
     Preconditions.checkArgument(isNotEmpty(s3Attributes.getKey()), "No Key");
     Preconditions.checkArgument(contentLength >= 0, "Negative content length");
+    this.context = ctx;
     this.bucket = s3Attributes.getBucket();
     this.key = s3Attributes.getKey();
+    this.pathStr = ctx.dstFileStatus.getPath().toString();
     this.contentLength = contentLength;
     this.client = client;
-    this.stats = stats;
     this.uri = "s3a://" + this.bucket + "/" + this.key;
-    this.streamStatistics = instrumentation.newInputStreamStatistics();
+    this.streamStatistics = ctx.instrumentation.newInputStreamStatistics();
     this.serverSideEncryptionAlgorithm =
         s3Attributes.getServerSideEncryptionAlgorithm();
     this.serverSideEncryptionKey = s3Attributes.getServerSideEncryptionKey();
     setInputPolicy(inputPolicy);
     setReadahead(readahead);
-    this.invoker = invoker;
   }
 
   /**
@@ -162,6 +154,7 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * @param length length requested
    * @throws IOException on any failure to open the object
    */
+  @Retries.OnceTranslated
   private synchronized void reopen(String reason, long targetPos, long length)
       throws IOException {
 
@@ -185,7 +178,7 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
     }
     String text = String.format("Failed to %s %s at %d",
         (opencount == 0 ? "open" : "re-open"), uri, targetPos);
-    S3Object object = invoker.retry(text, uri, true,
+    S3Object object = context.getReadInvoker().once(text, uri,
         () -> client.getObject(request));
     wrappedStream = object.getObjectContent();
     contentRangeStart = targetPos;
@@ -241,6 +234,7 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * @param length length of content that needs to be read from targetPos
    * @throws IOException
    */
+  @Retries.OnceTranslated
   private void seekInStream(long targetPos, long length) throws IOException {
     checkNotClosed();
     if (wrappedStream == null) {
@@ -317,14 +311,22 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * @param targetPos position from where data should be read
    * @param len length of the content that needs to be read
    */
+  @Retries.RetryTranslated
   private void lazySeek(long targetPos, long len) throws IOException {
-    //For lazy seek
-    seekInStream(targetPos, len);
 
-    //re-open at specific location if needed
-    if (wrappedStream == null) {
-      reopen("read from new offset", targetPos, len);
-    }
+    // With S3Guard, the metadatastore gave us metadata for the file in
+    // open(), so we use a slightly different retry policy.
+    Invoker invoker = context.getReadInvoker();
+    invoker.retry("lazySeek", pathStr, true,
+        () -> {
+          //For lazy seek
+          seekInStream(targetPos, len);
+
+          //re-open at specific location if needed
+          if (wrappedStream == null) {
+            reopen("read from new offset", targetPos, len);
+          }
+        });
   }
 
   /**
@@ -334,29 +336,44 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    */
   private void incrementBytesRead(long bytesRead) {
     streamStatistics.bytesRead(bytesRead);
-    if (stats != null && bytesRead > 0) {
-      stats.incrementBytesRead(bytesRead);
+    if (context.stats != null && bytesRead > 0) {
+      context.stats.incrementBytesRead(bytesRead);
     }
   }
 
   @Override
+  @Retries.RetryTranslated  // Some retries only happen w/ S3Guard, as intended.
   public synchronized int read() throws IOException {
     checkNotClosed();
     if (this.contentLength == 0 || (nextReadPos >= contentLength)) {
       return -1;
     }
 
-    int byteRead;
     try {
       lazySeek(nextReadPos, 1);
-      byteRead = wrappedStream.read();
     } catch (EOFException e) {
       return -1;
-    } catch (IOException e) {
-      onReadFailure(e, 1);
-      byteRead = wrappedStream.read();
     }
 
+    // With S3Guard, the metadatastore gave us metadata for the file in
+    // open(), so we use a slightly different retry policy.
+    // read() may not be likely to fail, but reopen() does a GET which
+    // certainly could.
+    Invoker invoker = context.getReadInvoker();
+    int byteRead = invoker.retry("read", pathStr, true,
+        () -> {
+          int b;
+          try {
+            b = wrappedStream.read();
+          } catch (EOFException e) {
+            return -1;
+          } catch (IOException e) {
+            onReadFailure(e, 1);
+            b = wrappedStream.read();
+          }
+          return b;
+        });
+
     if (byteRead >= 0) {
       pos++;
       nextReadPos++;
@@ -375,10 +392,11 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * @param length length of data being attempted to read
    * @throws IOException any exception thrown on the re-open attempt.
    */
+  @Retries.OnceTranslated
   private void onReadFailure(IOException ioe, int length) throws IOException {
-    LOG.info("Got exception while trying to read from stream {}"
-        + " trying to recover: "+ ioe, uri);
-    LOG.debug("While trying to read from stream {}", uri, ioe);
+
+    LOG.info("Got exception while trying to read from stream {}," +
+        " trying to recover: " + ioe, uri);
     streamStatistics.readException();
     reopen("failure recovery", pos, length);
   }
@@ -392,6 +410,7 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * @throws IOException if there are other problems
    */
   @Override
+  @Retries.RetryTranslated  // Some retries only happen w/ S3Guard, as intended.
   public synchronized int read(byte[] buf, int off, int len)
       throws IOException {
     checkNotClosed();
@@ -412,18 +431,27 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
       return -1;
     }
 
-    int bytesRead;
-    try {
-      streamStatistics.readOperationStarted(nextReadPos, len);
-      bytesRead = wrappedStream.read(buf, off, len);
-    } catch (EOFException e) {
-      onReadFailure(e, len);
-      // the base implementation swallows EOFs.
-      return -1;
-    } catch (IOException e) {
-      onReadFailure(e, len);
-      bytesRead = wrappedStream.read(buf, off, len);
-    }
+    // With S3Guard, the metadatastore gave us metadata for the file in
+    // open(), so we use a slightly different retry policy.
+    // read() may not be likely to fail, but reopen() does a GET which
+    // certainly could.
+    Invoker invoker = context.getReadInvoker();
+
+    streamStatistics.readOperationStarted(nextReadPos, len);
+    int bytesRead = invoker.retry("read", pathStr, true,
+        () -> {
+          int bytes;
+          try {
+            bytes = wrappedStream.read(buf, off, len);
+          } catch (EOFException e) {
+            // the base implementation swallows EOFs.
+            return -1;
+          } catch (IOException e) {
+            onReadFailure(e, len);
+            bytes = wrappedStream.read(buf, off, len);
+          }
+          return bytes;
+        });
 
     if (bytesRead > 0) {
       pos += bytesRead;
@@ -481,6 +509,7 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    * @param length length of the stream.
    * @param forceAbort force an abort; used if explicitly requested.
    */
+  @Retries.OnceRaw
   private void closeStream(String reason, long length, boolean forceAbort) {
     if (wrappedStream != null) {
 
@@ -645,6 +674,7 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead {
    *
    */
   @Override
+  @Retries.RetryTranslated  // Some retries only happen w/ S3Guard, as intended.
   public void readFully(long position, byte[] buffer, int offset, int length)
       throws IOException {
     checkNotClosed();

+ 87 - 0
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java

@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import javax.annotation.Nullable;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+
+/**
+ * Base class for operation context struct passed through codepaths for main
+ * S3AFileSystem operations.
+ * Anything op-specific should be moved to a subclass of this.
+ */
+@SuppressWarnings("visibilitymodifier")  // I want a struct of finals, for real.
+public class S3AOpContext {
+
+  final boolean isS3GuardEnabled;
+  final Invoker invoker;
+  @Nullable final FileSystem.Statistics stats;
+  final S3AInstrumentation instrumentation;
+  @Nullable final Invoker s3guardInvoker;
+
+  /** FileStatus for "destination" path being operated on. */
+  protected final FileStatus dstFileStatus;
+
+  /**
+   * Alternate constructor that allows passing in two invokers, the common
+   * one, and another with the S3Guard Retry Policy.
+   * @param isS3GuardEnabled true if s3Guard is active
+   * @param invoker invoker, which contains retry policy
+   * @param s3guardInvoker s3guard-specific retry policy invoker
+   * @param stats optional stats object
+   * @param instrumentation instrumentation to use
+   * @param dstFileStatus file status from existence check
+   */
+  public S3AOpContext(boolean isS3GuardEnabled, Invoker invoker,
+      Invoker s3guardInvoker, @Nullable FileSystem.Statistics stats,
+      S3AInstrumentation instrumentation, FileStatus dstFileStatus) {
+
+    Preconditions.checkNotNull(invoker, "Null invoker arg");
+    Preconditions.checkNotNull(instrumentation, "Null instrumentation arg");
+    Preconditions.checkNotNull(dstFileStatus, "Null dstFileStatus arg");
+    this.isS3GuardEnabled = isS3GuardEnabled;
+    Preconditions.checkArgument(!isS3GuardEnabled || s3guardInvoker != null,
+        "S3Guard invoker required: S3Guard is enabled.");
+    this.invoker = invoker;
+    this.s3guardInvoker = s3guardInvoker;
+    this.stats = stats;
+    this.instrumentation = instrumentation;
+    this.dstFileStatus = dstFileStatus;
+  }
+
+  /**
+   * Constructor using common invoker and retry policy.
+   * @param isS3GuardEnabled true if s3Guard is active
+   * @param invoker invoker, which contains retry policy
+   * @param stats optional stats object
+   * @param instrumentation instrumentation to use
+   * @param dstFileStatus file status from existence check
+   */
+  public S3AOpContext(boolean isS3GuardEnabled, Invoker invoker,
+      @Nullable FileSystem.Statistics stats, S3AInstrumentation instrumentation,
+      FileStatus dstFileStatus) {
+    this(isS3GuardEnabled, invoker, null, stats, instrumentation,
+        dstFileStatus);
+  }
+
+}

+ 56 - 0
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java

@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+
+import javax.annotation.Nullable;
+
+/**
+ * Read-specific operation context struct.
+ */
+public class S3AReadOpContext extends S3AOpContext {
+  public S3AReadOpContext(boolean isS3GuardEnabled, Invoker invoker,
+      Invoker s3guardInvoker, @Nullable FileSystem.Statistics stats,
+      S3AInstrumentation instrumentation, FileStatus dstFileStatus) {
+    super(isS3GuardEnabled, invoker, s3guardInvoker, stats, instrumentation,
+        dstFileStatus);
+  }
+
+  public S3AReadOpContext(boolean isS3GuardEnabled, Invoker invoker,
+      @Nullable FileSystem.Statistics stats, S3AInstrumentation instrumentation,
+      FileStatus dstFileStatus) {
+    super(isS3GuardEnabled, invoker, stats, instrumentation, dstFileStatus);
+  }
+
+  /**
+   * Get invoker to use for read operations.  When S3Guard is enabled we use
+   * the S3Guard invoker, which deals with things like FileNotFoundException
+   * differently.
+   * @return invoker to use for read codepaths
+   */
+  public Invoker getReadInvoker() {
+    if (isS3GuardEnabled) {
+      return s3guardInvoker;
+    } else {
+      return invoker;
+    }
+  }
+}
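
getReadInvoker() is the single decision point: with S3Guard enabled, read paths get the invoker built around S3GuardExistsRetryPolicy, otherwise the ordinary invoker. The following is a minimal sketch of that context/invoker split in plain Java; SimpleInvoker and ReadContext are simplified stand-ins, not the Hadoop Invoker or S3AReadOpContext classes, and the attempt counts are arbitrary.

    import java.io.FileNotFoundException;
    import java.util.concurrent.Callable;

    /** Simplified stand-ins for the context/invoker split shown above. */
    class ReadContextSketch {

      /** Runs an operation, optionally retrying FileNotFoundException. */
      static final class SimpleInvoker {
        private final boolean retryFileNotFound;
        private final int maxAttempts;

        SimpleInvoker(boolean retryFileNotFound, int maxAttempts) {
          this.retryFileNotFound = retryFileNotFound;
          this.maxAttempts = maxAttempts;
        }

        <T> T retry(Callable<T> operation) throws Exception {
          for (int attempt = 1; ; attempt++) {
            try {
              return operation.call();
            } catch (FileNotFoundException e) {
              if (!retryFileNotFound || attempt >= maxAttempts) {
                throw e;
              }
              System.out.println("retry #" + attempt + " after: " + e.getMessage());
            }
          }
        }
      }

      /** Context that hands read paths the appropriate invoker. */
      static final class ReadContext {
        private final boolean guardEnabled;
        private final SimpleInvoker plain = new SimpleInvoker(false, 1);
        private final SimpleInvoker guarded = new SimpleInvoker(true, 3);

        ReadContext(boolean guardEnabled) {
          this.guardEnabled = guardEnabled;
        }

        SimpleInvoker getReadInvoker() {
          // Mirrors getReadInvoker() above: guarded reads tolerate FNFE briefly.
          return guardEnabled ? guarded : plain;
        }
      }

      public static void main(String[] args) throws Exception {
        ReadContext ctx = new ReadContext(true);
        int[] calls = {0};
        int result = ctx.getReadInvoker().retry(() -> {
          if (++calls[0] < 3) {
            throw new FileNotFoundException("object not yet visible");
          }
          return 42;
        });
        System.out.println("read result: " + result);
      }
    }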

+ 36 - 9
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java

@@ -76,10 +76,30 @@ import static org.apache.hadoop.fs.s3a.Constants.*;
  * @see <a href="http://docs.aws.amazon.com/AmazonS3/latest/dev/ErrorBestPractices.html">Amazon S3 Error Best Practices</a>
 * @see <a href="http://docs.aws.amazon.com/amazondynamodb/latest/APIReference/CommonErrors.html">Dynamo DB Common errors</a>
  */
+@SuppressWarnings("visibilitymodifier")  // I want a struct of finals, for real.
 public class S3ARetryPolicy implements RetryPolicy {
 
+  /** Final retry policy we end up with. */
   private final RetryPolicy retryPolicy;
 
+  // Retry policies for mapping exceptions to
+
+  /** Base policy from configuration. */
+  protected final RetryPolicy fixedRetries;
+
+  /** Rejection of all non-idempotent calls except specific failures. */
+  protected final RetryPolicy retryIdempotentCalls;
+
+  /** Policy for throttle requests, which are considered repeatable, even for
+   * non-idempotent calls, as the service rejected the call entirely. */
+  protected final RetryPolicy throttlePolicy;
+
+  /** No retry on network and tangible API issues. */
+  protected final RetryPolicy fail = RetryPolicies.TRY_ONCE_THEN_FAIL;
+
+  /** Client connectivity: fixed retries without care for idempotency. */
+  protected final RetryPolicy connectivityFailure;
+
   /**
    * Instantiate.
    * @param conf configuration to read.
@@ -88,7 +108,7 @@ public class S3ARetryPolicy implements RetryPolicy {
     Preconditions.checkArgument(conf != null, "Null configuration");
 
     // base policy from configuration
-    RetryPolicy fixedRetries = retryUpToMaximumCountWithFixedSleep(
+    fixedRetries = retryUpToMaximumCountWithFixedSleep(
         conf.getInt(RETRY_LIMIT, RETRY_LIMIT_DEFAULT),
         conf.getTimeDuration(RETRY_INTERVAL,
             RETRY_INTERVAL_DEFAULT,
@@ -97,25 +117,33 @@ public class S3ARetryPolicy implements RetryPolicy {
 
     // which is wrapped by a rejection of all non-idempotent calls except
     // for specific failures.
-    RetryPolicy retryIdempotentCalls = new FailNonIOEs(
+    retryIdempotentCalls = new FailNonIOEs(
         new IdempotencyRetryFilter(fixedRetries));
 
     // and a separate policy for throttle requests, which are considered
     // repeatable, even for non-idempotent calls, as the service
     // rejected the call entirely
-    RetryPolicy throttlePolicy = exponentialBackoffRetry(
+    throttlePolicy = exponentialBackoffRetry(
         conf.getInt(RETRY_THROTTLE_LIMIT, RETRY_THROTTLE_LIMIT_DEFAULT),
         conf.getTimeDuration(RETRY_THROTTLE_INTERVAL,
             RETRY_THROTTLE_INTERVAL_DEFAULT,
             TimeUnit.MILLISECONDS),
         TimeUnit.MILLISECONDS);
 
-    // no retry on network and tangible API issues
-    RetryPolicy fail = RetryPolicies.TRY_ONCE_THEN_FAIL;
-
     // client connectivity: fixed retries without care for idempotency
-    RetryPolicy connectivityFailure = fixedRetries;
+    connectivityFailure = fixedRetries;
 
+    Map<Class<? extends Exception>, RetryPolicy> policyMap =
+        createExceptionMap();
+    retryPolicy = retryByException(retryIdempotentCalls, policyMap);
+  }
+
+  /**
+   * Subclasses can override this, much as they would a constructor, to change
+   * behavior: call the superclass method, modify the returned map as needed,
+   * and return it.
+   * @return Map from exception type to RetryPolicy
+   */
+  protected Map<Class<? extends Exception>, RetryPolicy> createExceptionMap() {
     // the policy map maps the exact classname; subclasses do not
     // inherit policies.
     Map<Class<? extends Exception>, RetryPolicy> policyMap = new HashMap<>();
@@ -126,7 +154,6 @@ public class S3ARetryPolicy implements RetryPolicy {
     policyMap.put(InterruptedException.class, fail);
     // note this does not pick up subclasses (like socket timeout)
     policyMap.put(InterruptedIOException.class, fail);
-    policyMap.put(AWSRedirectException.class, fail);
     // interesting question: should this be retried ever?
     policyMap.put(AccessDeniedException.class, fail);
     policyMap.put(FileNotFoundException.class, fail);
@@ -169,7 +196,7 @@ public class S3ARetryPolicy implements RetryPolicy {
     // trigger sleep
     policyMap.put(ProvisionedThroughputExceededException.class, throttlePolicy);
 
-    retryPolicy = retryByException(retryIdempotentCalls, policyMap);
+    return policyMap;
   }
 
   @Override

+ 47 - 0
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3GuardExistsRetryPolicy.java

@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import java.io.FileNotFoundException;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.retry.RetryPolicy;
+
+
+/**
+ * Slightly modified retry policy for cases where the file is present in the
+ * MetadataStore, but reads from S3 may still throw FileNotFoundException.
+ */
+public class S3GuardExistsRetryPolicy extends S3ARetryPolicy {
+  /**
+   * Instantiate.
+   * @param conf configuration to read.
+   */
+  public S3GuardExistsRetryPolicy(Configuration conf) {
+    super(conf);
+  }
+
+  @Override
+  protected Map<Class<? extends Exception>, RetryPolicy> createExceptionMap() {
+    Map<Class<? extends Exception>, RetryPolicy> b = super.createExceptionMap();
+    b.put(FileNotFoundException.class, retryIdempotentCalls);
+    return b;
+  }
+}
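
The subclass only needs to adjust one entry in the map returned by createExceptionMap(): FileNotFoundException moves from the fail policy to the idempotent-retry policy. A compact illustration of the same override-the-exception-map pattern in plain Java follows; BasePolicy, ExistsPolicy, and the Action enum are invented names for this sketch, unrelated to the Hadoop RetryPolicy classes.

    import java.io.FileNotFoundException;
    import java.net.UnknownHostException;
    import java.util.HashMap;
    import java.util.Map;

    /** Sketch of an exception-classifying retry policy and a subclass tweak. */
    class RetryPolicySketch {

      enum Action { FAIL, RETRY }

      static class BasePolicy {
        /** Subclasses override this, constructor-style, to adjust the map. */
        protected Map<Class<? extends Exception>, Action> createExceptionMap() {
          Map<Class<? extends Exception>, Action> map = new HashMap<>();
          map.put(FileNotFoundException.class, Action.FAIL);
          map.put(UnknownHostException.class, Action.RETRY);
          return map;
        }

        Action classify(Exception e) {
          // Exact-class lookup: subclasses of a mapped exception do not inherit.
          return createExceptionMap().getOrDefault(e.getClass(), Action.FAIL);
        }
      }

      /** Same idea as S3GuardExistsRetryPolicy: FNFE becomes retryable. */
      static class ExistsPolicy extends BasePolicy {
        @Override
        protected Map<Class<? extends Exception>, Action> createExceptionMap() {
          Map<Class<? extends Exception>, Action> map = super.createExceptionMap();
          map.put(FileNotFoundException.class, Action.RETRY);
          return map;
        }
      }

      public static void main(String[] args) {
        Exception fnfe = new FileNotFoundException("missing");
        System.out.println("base policy:   " + new BasePolicy().classify(fnfe));
        System.out.println("exists policy: " + new ExistsPolicy().classify(fnfe));
      }
    }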

+ 14 - 6
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java

@@ -688,9 +688,11 @@ public class DynamoDBMetadataStore implements MetadataStore {
   @Override
   @Retries.OnceRaw
   public void put(Collection<PathMetadata> metas) throws IOException {
-    LOG.debug("Saving batch to table {} in region {}", tableName, region);
 
-    processBatchWriteRequest(null, pathMetadataToItem(completeAncestry(metas)));
+    Item[] items = pathMetadataToItem(completeAncestry(metas));
+    LOG.debug("Saving batch of {} items to table {}, region {}", items.length,
+        tableName, region);
+    processBatchWriteRequest(null, items);
   }
 
   /**
@@ -1076,6 +1078,15 @@ public class DynamoDBMetadataStore implements MetadataStore {
         });
   }
 
+  @Retries.RetryTranslated
+  @VisibleForTesting
+  void provisionTableBlocking(Long readCapacity, Long writeCapacity)
+      throws IOException {
+    provisionTable(readCapacity, writeCapacity);
+    waitForTableActive(table);
+  }
+
+  @VisibleForTesting
   Table getTable() {
     return table;
   }
@@ -1173,15 +1184,12 @@ public class DynamoDBMetadataStore implements MetadataStore {
             S3GUARD_DDB_TABLE_CAPACITY_WRITE_KEY,
             currentWrite);
 
-    ProvisionedThroughput throughput = new ProvisionedThroughput()
-        .withReadCapacityUnits(newRead)
-        .withWriteCapacityUnits(newWrite);
     if (newRead != currentRead || newWrite != currentWrite) {
       LOG.info("Current table capacity is read: {}, write: {}",
           currentRead, currentWrite);
       LOG.info("Changing capacity of table to read: {}, write: {}",
           newRead, newWrite);
-      table.updateTable(throughput);
+      provisionTableBlocking(newRead, newWrite);
     } else {
       LOG.info("Table capacity unchanged at read: {}, write: {}",
           newRead, newWrite);

+ 3 - 0
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java

@@ -34,6 +34,9 @@ import org.apache.hadoop.fs.Path;
  * {@code MetadataStore} defines the set of operations that any metadata store
  * implementation must provide.  Note that all {@link Path} objects provided
  * to methods must be absolute, not relative paths.
+ * Implementations must implement any retries needed internally, such that
+ * transient errors are generally recovered from without throwing exceptions
+ * from this API.
  */
 @InterfaceAudience.Private
 @InterfaceStability.Evolving
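
The new contract sentence puts the burden of retrying transient failures on MetadataStore implementations rather than on callers. Below is a hedged sketch of what that can look like inside an implementation, in plain Java only; Backend and RetryingStoreSketch are hypothetical names, and the linear backoff is an arbitrary choice for the example.

    import java.io.IOException;
    import java.io.InterruptedIOException;

    /** Sketch of a store method that hides transient failures behind retries. */
    class RetryingStoreSketch {

      /** Hypothetical backing store; a real implementation talks to DynamoDB etc. */
      interface Backend {
        String get(String path) throws IOException;
      }

      private final Backend backend;
      private final int maxAttempts;

      RetryingStoreSketch(Backend backend, int maxAttempts) {
        this.backend = backend;
        this.maxAttempts = maxAttempts;
      }

      /** Callers only see an IOException once retries are exhausted. */
      String get(String path) throws IOException {
        IOException last = null;
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
          try {
            return backend.get(path);
          } catch (IOException e) {
            last = e;
            try {
              Thread.sleep(100L * attempt);  // simple linear backoff
            } catch (InterruptedException ie) {
              Thread.currentThread().interrupt();
              throw new InterruptedIOException("interrupted during retry");
            }
          }
        }
        throw last;
      }

      public static void main(String[] args) throws IOException {
        int[] calls = {0};
        RetryingStoreSketch store = new RetryingStoreSketch(path -> {
          if (++calls[0] < 3) {
            throw new IOException("transient failure " + calls[0]);
          }
          return "metadata for " + path;
        }, 5);
        System.out.println(store.get("/a/b/c"));
      }
    }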

+ 5 - 0
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md

@@ -300,6 +300,11 @@ By their very nature they are slow. And, as their execution time is often
 limited by bandwidth between the computer running the tests and the S3 endpoint,
 parallel execution does not speed these tests up.
 
+***Note: Running scale tests with -Ds3guard and -Ddynamo requires that
+you use a private, testing-only DynamoDB table.*** The tests do disruptive
+things such as deleting metadata and setting the provisioned throughput
+to very low values.
+
 ### <a name="enabling-scale"></a> Enabling the Scale Tests
 
 The tests are enabled if the `scale` property is set in the maven build

+ 106 - 10
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java

@@ -22,16 +22,22 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.AbstractFSContract;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.contract.s3a.S3AContract;
+import org.apache.hadoop.fs.s3a.s3guard.MetadataStore;
+import org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore;
 import org.apache.hadoop.test.LambdaTestUtils;
 import org.junit.Test;
 
 import java.io.FileNotFoundException;
-import java.util.concurrent.Callable;
+import java.io.InputStream;
 
 import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.writeTextFile;
 import static org.apache.hadoop.fs.s3a.Constants.*;
-import static org.apache.hadoop.fs.s3a.InconsistentAmazonS3Client.*;
+import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*;
+import static org.apache.hadoop.test.LambdaTestUtils.eventually;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 
 /**
  * Tests S3A behavior under forced inconsistency via {@link
@@ -43,6 +49,8 @@ import static org.apache.hadoop.fs.s3a.InconsistentAmazonS3Client.*;
  */
 public class ITestS3AInconsistency extends AbstractS3ATestBase {
 
+  private static final int OPEN_READ_ITERATIONS = 20;
+
   @Override
   protected AbstractFSContract createContract(Configuration conf) {
     conf.setClass(S3_CLIENT_FACTORY_IMPL, InconsistentS3ClientFactory.class,
@@ -86,15 +94,103 @@ public class ITestS3AInconsistency extends AbstractS3ATestBase {
     }
   }
 
+
+  /**
+   * Ensure that deleting a file with an open read stream does eventually cause
+   * readers to get a FNFE, even with S3Guard and its retries enabled.
+   * In real usage, S3Guard should be enabled for all clients that modify the
+   * file, so the delete would be immediately recorded in the MetadataStore.
+   * Here, however, we test deletion from under S3Guard to make sure it still
+   * eventually propagates the FNFE after any retry policies are exhausted.
+   */
+  @Test
+  public void testOpenDeleteRead() throws Exception {
+    S3AFileSystem fs = getFileSystem();
+    Path p = path("testOpenDeleteRead.txt");
+    writeTextFile(fs, p, "1337c0d3z", true);
+    try (InputStream s = fs.open(p)) {
+      // Disable s3guard, delete file underneath it, re-enable s3guard
+      MetadataStore metadataStore = fs.getMetadataStore();
+      fs.setMetadataStore(new NullMetadataStore());
+      fs.delete(p, false);
+      fs.setMetadataStore(metadataStore);
+      eventually(1000, 200, () -> {
+        intercept(FileNotFoundException.class, () -> s.read());
+      });
+    }
+  }
+
+  /**
+   * Test read() path behavior when getFileStatus() succeeds but subsequent
+   * read() on the input stream fails due to eventual consistency.
+   * There are many points in the InputStream codepaths that can fail. We set
+   * a probability of failure and repeat the test multiple times to achieve
+   * decent coverage.
+   */
+  @Test
+  public void testOpenFailOnRead() throws Exception {
+
+    S3AFileSystem fs = getFileSystem();
+
+    // 1. Patch in a different failure injection policy with <1.0 probability
+    Configuration conf = fs.getConf();
+    conf.setFloat(FAIL_INJECT_INCONSISTENCY_PROBABILITY, 0.5f);
+    InconsistentAmazonS3Client.setFailureInjectionPolicy(fs,
+        new FailureInjectionPolicy(conf));
+
+    // 2. Make sure no ancestor dirs exist
+    Path dir = path("ancestor");
+    fs.delete(dir, true);
+    waitUntilDeleted(dir);
+
+    // 3. Create a descendant file, which implicitly creates ancestors
+    // This file has delayed visibility.
+    describe("creating test file");
+    Path path = path("ancestor/file-to-read-" + DEFAULT_DELAY_KEY_SUBSTRING);
+    writeTextFile(getFileSystem(), path, "Reading is fun", false);
+
+    // 4. Clear inconsistency so the first getFileStatus() can succeed, if we
+    // are not using S3Guard. If we are using S3Guard, it should tolerate the
+    // delayed visibility.
+    if (!fs.hasMetadataStore()) {
+      InconsistentAmazonS3Client.clearInconsistency(fs);
+    }
+
+    // ? Do we need multiple iterations when S3Guard is disabled?  For now,
+    // leaving it in
+    for (int i = 0; i < OPEN_READ_ITERATIONS; i++) {
+      doOpenFailOnReadTest(fs, path, i);
+    }
+  }
+
+  private void doOpenFailOnReadTest(S3AFileSystem fs, Path path, int iteration)
+      throws Exception {
+
+    // 4. Open the file
+    describe(String.format("i=%d: opening test file", iteration));
+    try (InputStream in = fs.open(path)) {
+      // 5. Assert expected behavior on read() failure.
+      int l = 4;
+      byte[] buf = new byte[l];
+      describe("reading test file");
+      // Use both read() variants
+      if ((iteration % 2) == 0) {
+        assertEquals(l, in.read(buf, 0, l));
+      } else {
+        in.read();
+      }
+    } catch (FileNotFoundException e) {
+      if (fs.hasMetadataStore()) {
+        LOG.error("Error:", e);
+        ContractTestUtils.fail("S3Guard failed to handle fail-on-read", e);
+      } else {
+        LOG.info("File not found on read(), as expected.");
+      }
+    }
+  }
+
   private void waitUntilDeleted(final Path p) throws Exception {
     LambdaTestUtils.eventually(30 * 1000, 1000,
-        new Callable<Void>() {
-          @Override
-          public Void call() throws Exception {
-            assertPathDoesNotExist("Dir should be deleted", p);
-            return null;
-          }
-        }
-    );
+        () -> assertPathDoesNotExist("Dir should be deleted", p));
   }
 }

+ 4 - 9
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardListConsistency.java

@@ -20,7 +20,6 @@ package org.apache.hadoop.fs.s3a;
 
 import com.amazonaws.services.s3.model.ListObjectsV2Request;
 import com.amazonaws.services.s3.model.ListObjectsV2Result;
-import com.amazonaws.services.s3.AmazonS3;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
@@ -33,6 +32,7 @@ import org.junit.Assume;
 import org.junit.Test;
 
 import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -41,6 +41,7 @@ import java.util.List;
 import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
 import static org.apache.hadoop.fs.contract.ContractTestUtils.writeTextFile;
 import static org.apache.hadoop.fs.s3a.Constants.*;
+import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*;
 import static org.apache.hadoop.fs.s3a.InconsistentAmazonS3Client.*;
 
 /**
@@ -552,11 +553,10 @@ public class ITestS3GuardListConsistency extends AbstractS3ATestBase {
    * @param key
    * @param delimiter
    * @return
-   * @throws IOException
+   * @throws IOException on error
    */
-
   private ListObjectsV2Result listObjectsV2(S3AFileSystem fs,
-      String key, String delimiter) throws java.io.IOException {
+      String key, String delimiter) throws IOException {
     ListObjectsV2Request k = fs.createListObjectsRequest(key, delimiter)
         .getV2();
     return invoker.retryUntranslated("list", true,
@@ -565,9 +565,4 @@ public class ITestS3GuardListConsistency extends AbstractS3ATestBase {
         });
   }
 
-  private static void clearInconsistency(S3AFileSystem fs) throws Exception {
-    AmazonS3 s3 = fs.getAmazonS3ClientForTesting("s3guard");
-    InconsistentAmazonS3Client ic = InconsistentAmazonS3Client.castFrom(s3);
-    ic.clearInconsistency();
-  }
 }

+ 6 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java

@@ -24,6 +24,7 @@ import java.net.URI;
 import com.amazonaws.AmazonClientException;
 import com.amazonaws.services.s3.AmazonS3;
 import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
+import com.amazonaws.services.s3.model.UploadPartRequest;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -277,6 +278,11 @@ public class MockS3AFileSystem extends S3AFileSystem {
 // no-op
   }
 
+  @Override
+  protected void setOptionalUploadPartRequestParameters(
+      UploadPartRequest request) {
+  }
+
   @Override
   @SuppressWarnings("deprecation")
   public long getDefaultBlockSize() {

+ 2 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java

@@ -51,7 +51,7 @@ import java.util.List;
 import java.util.concurrent.Callable;
 
 import static org.apache.hadoop.fs.contract.ContractTestUtils.skip;
-import static org.apache.hadoop.fs.s3a.InconsistentAmazonS3Client.*;
+import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*;
 import static org.apache.hadoop.fs.s3a.S3ATestConstants.*;
 import static org.apache.hadoop.fs.s3a.Constants.*;
 import static org.apache.hadoop.fs.s3a.S3AUtils.propagateBucketOptions;
@@ -819,7 +819,7 @@ public final class S3ATestUtils {
    * Turn on the inconsistent S3A FS client in a configuration,
    * with 100% probability of inconsistency, default delays.
    * For this to go live, the paths must include the element
-   * {@link InconsistentAmazonS3Client#DEFAULT_DELAY_KEY_SUBSTRING}.
+   * {@link FailureInjectionPolicy#DEFAULT_DELAY_KEY_SUBSTRING}.
    * @param conf configuration to patch
    * @param delay delay in millis
    */

+ 2 - 1
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java

@@ -34,6 +34,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.FailureInjectionPolicy;
 import org.apache.hadoop.fs.s3a.InconsistentAmazonS3Client;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
 import org.apache.hadoop.fs.s3a.WriteOperationHelper;
@@ -90,7 +91,7 @@ public abstract class AbstractCommitITest extends AbstractS3ATestBase {
   @Override
   protected Path path(String filepath) throws IOException {
     return useInconsistentClient() ?
-           super.path(InconsistentAmazonS3Client.DEFAULT_DELAY_KEY_SUBSTRING
+           super.path(FailureInjectionPolicy.DEFAULT_DELAY_KEY_SUBSTRING
                + "/" + filepath)
            : super.path(filepath);
   }

+ 174 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java

@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import javax.annotation.Nullable;
+
+import com.amazonaws.services.dynamodbv2.document.DynamoDB;
+import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputDescription;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.scale.AbstractITestS3AMetadataStoreScale;
+
+import static org.apache.hadoop.fs.s3a.s3guard.MetadataStoreTestBase.basicFileStatus;
+import static org.apache.hadoop.fs.s3a.Constants.*;
+import static org.junit.Assume.*;
+
+/**
+ * Scale test for DynamoDBMetadataStore.
+ */
+public class ITestDynamoDBMetadataStoreScale
+    extends AbstractITestS3AMetadataStoreScale {
+
+  private static final long BATCH_SIZE = 25;
+  private static final long SMALL_IO_UNITS = BATCH_SIZE / 4;
+
+  @Override
+  public MetadataStore createMetadataStore() throws IOException {
+    Configuration conf = getFileSystem().getConf();
+    String ddbTable = conf.get(S3GUARD_DDB_TABLE_NAME_KEY);
+    assumeNotNull("DynamoDB table is configured", ddbTable);
+    String ddbEndpoint = conf.get(S3GUARD_DDB_REGION_KEY);
+    assumeNotNull("DynamoDB endpoint is configured", ddbEndpoint);
+
+    DynamoDBMetadataStore ms = new DynamoDBMetadataStore();
+    ms.initialize(getFileSystem().getConf());
+    return ms;
+  }
+
+
+  /**
+   * Though the AWS SDK claims in documentation to handle retries and
+   * exponential backoff, we have witnessed
+   * com.amazonaws...dynamodbv2.model.ProvisionedThroughputExceededException
+   * (Status Code: 400; Error Code: ProvisionedThroughputExceededException)
+   * Hypothesis:
+   * Happens when the size of a batched write is bigger than the number of
+   * provisioned write units.  This test ensures we handle the case
+   * correctly, retrying w/ smaller batch instead of surfacing exceptions.
+   */
+  @Test
+  public void testBatchedWriteExceedsProvisioned() throws Exception {
+
+    final long iterations = 5;
+    boolean isProvisionedChanged;
+    List<PathMetadata> toCleanup = new ArrayList<>();
+
+    // Fail if someone changes a constant we depend on
+    assertTrue("Maximum batch size must be big enough to run this test",
+        S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT >= BATCH_SIZE);
+
+    try (DynamoDBMetadataStore ddbms =
+         (DynamoDBMetadataStore)createMetadataStore()) {
+
+      DynamoDB ddb = ddbms.getDynamoDB();
+      String tableName = ddbms.getTable().getTableName();
+      final ProvisionedThroughputDescription existing =
+          ddb.getTable(tableName).describe().getProvisionedThroughput();
+
+      // Setting the provisioned I/O to the values already in place throws an
+      // exception, so avoid doing that.
+      isProvisionedChanged = (existing.getReadCapacityUnits() != SMALL_IO_UNITS
+          || existing.getWriteCapacityUnits() != SMALL_IO_UNITS);
+
+      if (isProvisionedChanged) {
+        // Set low provisioned I/O for dynamodb
+        describe("Provisioning dynamo tbl %s read/write -> %d/%d", tableName,
+            SMALL_IO_UNITS, SMALL_IO_UNITS);
+        // Blocks to ensure table is back to ready state before we proceed
+        ddbms.provisionTableBlocking(SMALL_IO_UNITS, SMALL_IO_UNITS);
+      } else {
+        describe("Skipping provisioning table I/O, already %d/%d",
+            SMALL_IO_UNITS, SMALL_IO_UNITS);
+      }
+
+      try {
+        // We know the dynamodb metadata store will expand a put of a path
+        // of depth N into a batch of N writes (all ancestors are written
+        // separately up to the root).  (Ab)use this for an easy way to write
+        // a batch of stuff that is bigger than the provisioned write units
+        try {
+          describe("Running %d iterations of batched put, size %d", iterations,
+              BATCH_SIZE);
+          long pruneItems = 0;
+          for (long i = 0; i < iterations; i++) {
+            Path longPath = pathOfDepth(BATCH_SIZE, String.valueOf(i));
+            FileStatus status = basicFileStatus(longPath, 0, false, 12345,
+                12345);
+            PathMetadata pm = new PathMetadata(status);
+
+            ddbms.put(pm);
+            toCleanup.add(pm);
+            pruneItems++;
+            // It is hard to reproduce the Exceeded exception with put alone,
+            // so also try an occasional prune, which is the only stack trace
+            // seen so far (on JIRA).
+            if (pruneItems == BATCH_SIZE) {
+              describe("pruning files");
+              ddbms.prune(Long.MAX_VALUE /* all files */);
+              pruneItems = 0;
+            }
+          }
+        } finally {
+          describe("Cleaning up table %s", tableName);
+          for (PathMetadata pm : toCleanup) {
+            cleanupMetadata(ddbms, pm);
+          }
+        }
+      } finally {
+        if (isProvisionedChanged) {
+          long write = existing.getWriteCapacityUnits();
+          long read = existing.getReadCapacityUnits();
+          describe("Restoring dynamo tbl %s read/write -> %d/%d", tableName,
+              read, write);
+          ddbms.provisionTableBlocking(existing.getReadCapacityUnits(),
+              existing.getWriteCapacityUnits());
+        }
+      }
+    }
+  }
+
+  // Attempt to delete metadata, suppressing any errors.
+  private void cleanupMetadata(MetadataStore ms, PathMetadata pm) {
+    try {
+      ms.forgetMetadata(pm.getFileStatus().getPath());
+    } catch (IOException ioe) {
+      // Ignore.
+    }
+  }
+
+  private Path pathOfDepth(long n, @Nullable String fileSuffix) {
+    StringBuilder sb = new StringBuilder();
+    for (long i = 0; i < n; i++) {
+      sb.append(i == 0 ? "/" + this.getClass().getSimpleName() : "lvl");
+      sb.append(i);
+      if (i == n-1 && fileSuffix != null) {
+        sb.append(fileSuffix);
+      }
+      sb.append("/");
+    }
+    return new Path(getFileSystem().getUri().toString(), sb.toString());
+  }
+}
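
The test above exercises the case where a single batched write is larger than the table's provisioned write capacity and expects the store to recover, for example by retrying with smaller batches, instead of surfacing ProvisionedThroughputExceededException. The following is a standalone sketch of that split-and-retry behavior; it is a generic illustration under invented names (BatchedWriteSketch, ThrottledException), not the DynamoDBMetadataStore implementation.

    import java.util.ArrayList;
    import java.util.List;

    /** Sketch: halve an over-large batch when the service throttles it. */
    class BatchedWriteSketch {

      /** Stand-in for a throttling error such as ProvisionedThroughputExceededException. */
      static class ThrottledException extends RuntimeException {
        ThrottledException(String msg) { super(msg); }
      }

      /** Pretend service that rejects batches above its provisioned capacity. */
      static void writeBatch(List<String> items, int capacity) {
        if (items.size() > capacity) {
          throw new ThrottledException(
              "batch of " + items.size() + " exceeds capacity " + capacity);
        }
        System.out.println("wrote batch of " + items.size());
      }

      /** Recursively split the batch until each piece fits the capacity. */
      static void writeWithSplitting(List<String> items, int capacity) {
        try {
          writeBatch(items, capacity);
        } catch (ThrottledException e) {
          if (items.size() <= 1) {
            throw e;  // cannot split further; surface the error
          }
          int mid = items.size() / 2;
          writeWithSplitting(items.subList(0, mid), capacity);
          writeWithSplitting(items.subList(mid, items.size()), capacity);
        }
      }

      public static void main(String[] args) {
        List<String> batch = new ArrayList<>();
        for (int i = 0; i < 25; i++) {
          batch.add("/lvl" + i);  // a deep path means one write per ancestor
        }
        writeWithSplitting(batch, 6);
      }
    }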

+ 1 - 1
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java

@@ -839,7 +839,7 @@ public abstract class MetadataStoreTestBase extends Assert {
     return basicFileStatus(path, size, isDir, modTime, accessTime);
   }
 
-  FileStatus basicFileStatus(Path path, int size, boolean isDir,
+  public static FileStatus basicFileStatus(Path path, int size, boolean isDir,
       long newModTime, long newAccessTime) throws IOException {
     return new FileStatus(size, isDir, REPLICATION, BLOCK_SIZE, newModTime,
         newAccessTime, PERMISSION, OWNER, GROUP, path);

+ 0 - 48
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestDynamoDBMetadataStoreScale.java

@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.fs.s3a.scale;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore;
-import org.apache.hadoop.fs.s3a.s3guard.MetadataStore;
-
-import java.io.IOException;
-
-import static org.junit.Assume.*;
-import static org.apache.hadoop.fs.s3a.Constants.*;
-
-/**
- * Scale test for DynamoDBMetadataStore.
- */
-public class ITestDynamoDBMetadataStoreScale
-    extends AbstractITestS3AMetadataStoreScale {
-
-  @Override
-  public MetadataStore createMetadataStore() throws IOException {
-    Configuration conf = getFileSystem().getConf();
-    String ddbTable = conf.get(S3GUARD_DDB_TABLE_NAME_KEY);
-    assumeNotNull("DynamoDB table is configured", ddbTable);
-    String ddbEndpoint = conf.get(S3GUARD_DDB_REGION_KEY);
-    assumeNotNull("DynamoDB endpoint is configured", ddbEndpoint);
-
-    DynamoDBMetadataStore ms = new DynamoDBMetadataStore();
-    ms.initialize(getFileSystem().getConf());
-    return ms;
-  }
-}

+ 58 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesSSECDiskBlocks.java

@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.hadoop.fs.s3a.scale;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.s3a.Constants;
+import org.apache.hadoop.fs.s3a.S3AEncryptionMethods;
+import org.apache.hadoop.fs.s3a.S3ATestUtils;
+
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfEncryptionTestsDisabled;
+
+/**
+ * Concrete class that extends {@link ITestS3AHugeFilesDiskBlocks}
+ * and tests huge files operations with SSE-C encryption enabled.
+ * Skipped if the SSE tests are disabled.
+ */
+public class ITestS3AHugeFilesSSECDiskBlocks
+    extends ITestS3AHugeFilesDiskBlocks {
+
+  private static final String KEY_1
+      = "4niV/jPK5VFRHY+KNb6wtqYd4xXyMgdJ9XQJpcQUVbs=";
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+  }
+
+  @Override
+  protected Configuration createScaleConfiguration() {
+    Configuration conf = super.createScaleConfiguration();
+    S3ATestUtils.disableFilesystemCaching(conf);
+    conf.set(Constants.SERVER_SIDE_ENCRYPTION_ALGORITHM,
+        getSSEAlgorithm().getMethod());
+    conf.set(Constants.SERVER_SIDE_ENCRYPTION_KEY, KEY_1);
+    return conf;
+  }
+
+  private S3AEncryptionMethods getSSEAlgorithm() {
+    return S3AEncryptionMethods.SSE_C;
+  }
+}

+ 1 - 1
hadoop-tools/hadoop-openstack/src/test/java/org/apache/hadoop/fs/swift/TestSwiftFileSystemBlockLocation.java

@@ -49,7 +49,7 @@ public class TestSwiftFileSystemBlockLocation extends SwiftFileSystemBaseTest {
 
   private void assertLocationValid(BlockLocation location) throws
                                                            IOException {
-    LOG.info(location);
+    LOG.info("{}", location);
     String[] hosts = location.getHosts();
     String[] names = location.getNames();
     assertNotEqual("No hosts supplied for " + location, 0, hosts.length);

+ 336 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/AllocationTagNamespace.java

@@ -0,0 +1,336 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.api.records;
+
+import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableSet;
+import org.apache.hadoop.yarn.exceptions.InvalidAllocationTagException;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import static org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType.SELF;
+import static org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType.NOT_SELF;
+import static org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType.APP_LABEL;
+import static org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType.APP_ID;
+import static org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType.ALL;
+import static org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType.fromString;
+
+/**
+ * Class to describe the namespace of an allocation tag.
+ * Each namespace can be evaluated against a set of applications.
+ * After evaluation, the namespace should have an implicit set of
+ * applications which defines its scope.
+ */
+public abstract class AllocationTagNamespace implements
+    Evaluable<TargetApplications> {
+
+  public final static String NAMESPACE_DELIMITER = "/";
+
+  private AllocationTagNamespaceType nsType;
+  // Namespace scope value is late-bound by the evaluate method.
+  private Set<ApplicationId> nsScope;
+
+  public AllocationTagNamespace(AllocationTagNamespaceType
+      allocationTagNamespaceType) {
+    this.nsType = allocationTagNamespaceType;
+  }
+
+  protected void setScopeIfNotNull(Set<ApplicationId> appIds) {
+    if (appIds != null) {
+      this.nsScope = appIds;
+    }
+  }
+
+  /**
+   * Get the type of the namespace.
+   * @return namespace type.
+   */
+  public AllocationTagNamespaceType getNamespaceType() {
+    return nsType;
+  }
+
+  /**
+   * Get the scope of the namespace, in the form of a set of applications.
+   * {@link #evaluate(TargetApplications)} must be called beforehand to
+   * ensure the scope is properly evaluated.
+   *
+   * @return a set of applications.
+   */
+  public Set<ApplicationId> getNamespaceScope() {
+    if (this.nsScope == null) {
+      throw new IllegalStateException("Invalid namespace scope,"
+          + " it is not initialized. Evaluate must be called before"
+          + " a namespace can be consumed.");
+    }
+    return this.nsScope;
+  }
+
+  @Override
+  public abstract void evaluate(TargetApplications target)
+      throws InvalidAllocationTagException;
+
+  /**
+   * @return true if the namespace is effective in all applications
+   * in this cluster. Specifically the namespace prefix should be
+   * "all".
+   */
+  public boolean isGlobal() {
+    return AllocationTagNamespaceType.ALL.equals(getNamespaceType());
+  }
+
+  /**
+   * @return true if the namespace is effective within a single application
+   * by its application ID, the namespace prefix should be "app-id";
+   * false otherwise.
+   */
+  public boolean isSingleInterApp() {
+    return AllocationTagNamespaceType.APP_ID.equals(getNamespaceType());
+  }
+
+  /**
+   * @return true if the namespace is effective to the application itself,
+   * the namespace prefix should be "self"; false otherwise.
+   */
+  public boolean isIntraApp() {
+    return AllocationTagNamespaceType.SELF.equals(getNamespaceType());
+  }
+
+  /**
+   * @return true if the namespace is effective to all applications except
+   * itself, the namespace prefix should be "not-self"; false otherwise.
+   */
+  public boolean isNotSelf() {
+    return AllocationTagNamespaceType.NOT_SELF.equals(getNamespaceType());
+  }
+
+  /**
+   * @return true if the namespace is effective to a group of applications
+   * identified by an application label, the namespace prefix should be
+   * "app-label"; false otherwise.
+   */
+  public boolean isAppLabel() {
+    return AllocationTagNamespaceType.APP_LABEL.equals(getNamespaceType());
+  }
+
+  @Override
+  public String toString() {
+    return this.nsType.toString();
+  }
+
+  /**
+   * Namespace within application itself.
+   */
+  public static class Self extends AllocationTagNamespace {
+
+    public Self() {
+      super(SELF);
+    }
+
+    @Override
+    public void evaluate(TargetApplications target)
+        throws InvalidAllocationTagException {
+      if (target == null || target.getCurrentApplicationId() == null) {
+        throw new InvalidAllocationTagException("Namespace Self must"
+            + " be evaluated against a single application ID.");
+      }
+      ApplicationId applicationId = target.getCurrentApplicationId();
+      setScopeIfNotNull(ImmutableSet.of(applicationId));
+    }
+  }
+
+  /**
+   * Namespace to all applications except itself.
+   */
+  public static class NotSelf extends AllocationTagNamespace {
+
+    private ApplicationId applicationId;
+
+    public NotSelf() {
+      super(NOT_SELF);
+    }
+
+    /**
+     * The not-self namespace is evaluated relative to a particular
+     * application; that application's ID can be bound to the namespace
+     * lazily.
+     *
+     * @param appId application ID.
+     */
+    public void setApplicationId(ApplicationId appId) {
+      this.applicationId = appId;
+    }
+
+    public ApplicationId getApplicationId() {
+      return this.applicationId;
+    }
+
+    @Override
+    public void evaluate(TargetApplications target) {
+      Set<ApplicationId> otherAppIds = target.getOtherApplicationIds();
+      setScopeIfNotNull(otherAppIds);
+    }
+  }
+
+  /**
+   * Namespace to all applications in the cluster.
+   */
+  public static class All extends AllocationTagNamespace {
+
+    public All() {
+      super(ALL);
+    }
+
+    @Override
+    public void evaluate(TargetApplications target) {
+      Set<ApplicationId> allAppIds = target.getAllApplicationIds();
+      setScopeIfNotNull(allAppIds);
+    }
+  }
+
+  /**
+   * Namespace for a group of applications identified by an application label.
+   */
+  public static class AppLabel extends AllocationTagNamespace {
+
+    public AppLabel() {
+      super(APP_LABEL);
+    }
+
+    @Override
+    public void evaluate(TargetApplications target) {
+      // TODO Implement app-label namespace evaluation
+    }
+  }
+
+  /**
+   * Namespace defined by a certain application ID.
+   */
+  public static class AppID extends AllocationTagNamespace {
+
+    private ApplicationId targetAppId;
+    // The app-id namespace requires an application ID as an extra value.
+    public AppID(ApplicationId applicationId) {
+      super(APP_ID);
+      this.targetAppId = applicationId;
+    }
+
+    @Override
+    public void evaluate(TargetApplications target) {
+      setScopeIfNotNull(ImmutableSet.of(targetAppId));
+    }
+
+    @Override
+    public String toString() {
+      return APP_ID.toString() + NAMESPACE_DELIMITER + this.targetAppId;
+    }
+  }
+
+  /**
+   * Parse namespace from a string. The string must be in legal format
+   * defined by each {@link AllocationTagNamespaceType}.
+   *
+   * @param namespaceStr namespace string.
+   * @return an instance of {@link AllocationTagNamespace}.
+   * @throws InvalidAllocationTagException
+   * if the given string is not in a valid format.
+   */
+  public static AllocationTagNamespace parse(String namespaceStr)
+      throws InvalidAllocationTagException {
+    // Return the default namespace if no valid string is given.
+    if (Strings.isNullOrEmpty(namespaceStr)) {
+      return new Self();
+    }
+
+    // Normalize the input and drop empty segments (extra delimiters).
+    List<String> nsValues = normalize(namespaceStr);
+    // The first string should be the prefix.
+    String nsPrefix = nsValues.get(0);
+    AllocationTagNamespaceType allocationTagNamespaceType =
+        fromString(nsPrefix);
+    switch (allocationTagNamespaceType) {
+    case SELF:
+      return new Self();
+    case NOT_SELF:
+      return new NotSelf();
+    case ALL:
+      return new All();
+    case APP_ID:
+      if (nsValues.size() != 2) {
+        throw new InvalidAllocationTagException(
+            "Missing the application ID in the namespace string: "
+                + namespaceStr);
+      }
+      String appIDStr = nsValues.get(1);
+      return parseAppID(appIDStr);
+    case APP_LABEL:
+      return new AppLabel();
+    default:
+      throw new InvalidAllocationTagException(
+          "Invalid namespace string " + namespaceStr);
+    }
+  }
+
+  private static AllocationTagNamespace parseAppID(String appIDStr)
+      throws InvalidAllocationTagException {
+    try {
+      ApplicationId applicationId = ApplicationId.fromString(appIDStr);
+      return new AppID(applicationId);
+    } catch (IllegalArgumentException e) {
+      throw new InvalidAllocationTagException(
+          "Invalid application ID for "
+              + APP_ID.getTypeKeyword() + ": " + appIDStr);
+    }
+  }
+
+  /**
+   * Validates the given namespace string and splits it into a list of
+   * sub-strings that can be consumed by the parser according to the type
+   * of the namespace. Currently the size of the returned list should be
+   * either 1 or 2. Extra slashes are dropped during normalization.
+   *
+   * @param namespaceStr namespace string.
+   * @return a list of parsed strings.
+   * @throws InvalidAllocationTagException
+   * if namespace format is unexpected.
+   */
+  private static List<String> normalize(String namespaceStr)
+      throws InvalidAllocationTagException {
+    List<String> result = new ArrayList<>();
+    if (namespaceStr == null) {
+      return result;
+    }
+
+    String[] nsValues = namespaceStr.split(NAMESPACE_DELIMITER);
+    for (String str : nsValues) {
+      if (!Strings.isNullOrEmpty(str)) {
+        result.add(str);
+      }
+    }
+
+    // Currently we only allow 1 or 2 values for a namespace string
+    if (result.size() == 0 || result.size() > 2) {
+      throw new InvalidAllocationTagException("Invalid namespace string: "
+          + namespaceStr + ", the syntax is <namespace_prefix> or"
+          + " <namespace_prefix>/<namespace_value>");
+    }
+
+    return result;
+  }
+}
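
A minimal usage sketch of the namespace API added above (not part of the patch); the class name NamespaceParseExample and the application IDs are made up for illustration:

import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.yarn.api.records.AllocationTagNamespace;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.TargetApplications;
import org.apache.hadoop.yarn.exceptions.InvalidAllocationTagException;

public class NamespaceParseExample {
  public static void main(String[] args) throws InvalidAllocationTagException {
    ApplicationId current = ApplicationId.newInstance(123456L, 1);
    ApplicationId other = ApplicationId.newInstance(123456L, 2);
    Set<ApplicationId> all = new HashSet<>();
    all.add(current);
    all.add(other);

    // "not-self" resolves to every application except the current one.
    AllocationTagNamespace notSelf = AllocationTagNamespace.parse("not-self");
    notSelf.evaluate(new TargetApplications(current, all));
    System.out.println(notSelf.getNamespaceScope()); // only the other app

    // "app-id/<id>" pins the scope to one specific application.
    AllocationTagNamespace byId =
        AllocationTagNamespace.parse("app-id/" + other);
    byId.evaluate(new TargetApplications(current, all));
    System.out.println(byId.getNamespaceScope()); // just the given app
  }
}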

+ 74 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/AllocationTagNamespaceType.java

@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.api.records;
+
+import org.apache.hadoop.yarn.exceptions.InvalidAllocationTagException;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Class to describe all supported forms of namespaces for an allocation tag.
+ */
+public enum AllocationTagNamespaceType {
+
+  SELF("self"),
+  NOT_SELF("not-self"),
+  APP_ID("app-id"),
+  APP_LABEL("app-label"),
+  ALL("all");
+
+  private String typeKeyword;
+  AllocationTagNamespaceType(String keyword) {
+    this.typeKeyword = keyword;
+  }
+
+  public String getTypeKeyword() {
+    return this.typeKeyword;
+  }
+
+  /**
+   * Parses the namespace type from a given string.
+   * @param prefix namespace prefix.
+   * @return namespace type.
+   * @throws InvalidAllocationTagException if the prefix does not match
+   * any known namespace type.
+   */
+  public static AllocationTagNamespaceType fromString(String prefix) throws
+      InvalidAllocationTagException {
+    for (AllocationTagNamespaceType type :
+        AllocationTagNamespaceType.values()) {
+      if (type.getTypeKeyword().equals(prefix)) {
+        return type;
+      }
+    }
+
+    Set<String> values = Arrays.stream(AllocationTagNamespaceType.values())
+        .map(AllocationTagNamespaceType::toString)
+        .collect(Collectors.toSet());
+    throw new InvalidAllocationTagException(
+        "Invalid namespace prefix: " + prefix
+            + ", valid values are: " + String.join(",", values));
+  }
+
+  @Override
+  public String toString() {
+    return this.getTypeKeyword();
+  }
+}
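
A short sketch of the keyword lookup above (illustration only; the wrapper class is hypothetical):

import org.apache.hadoop.yarn.api.records.AllocationTagNamespaceType;
import org.apache.hadoop.yarn.exceptions.InvalidAllocationTagException;

public class NamespaceTypeExample {
  public static void main(String[] args) throws InvalidAllocationTagException {
    // Keywords map back to enum constants.
    AllocationTagNamespaceType type =
        AllocationTagNamespaceType.fromString("not-self");
    System.out.println(type == AllocationTagNamespaceType.NOT_SELF); // true
    System.out.println(type.getTypeKeyword());                       // not-self

    // An unknown prefix raises InvalidAllocationTagException, and the
    // message lists the valid keywords: self, not-self, app-id,
    // app-label, all.
    try {
      AllocationTagNamespaceType.fromString("bogus");
    } catch (InvalidAllocationTagException e) {
      System.out.println(e.getMessage());
    }
  }
}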

+ 50 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/AllocationTags.java

@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.api.records;
+
+import java.util.Set;
+
+/**
+ * Allocation tags under the same namespace.
+ */
+public class AllocationTags {
+
+  private AllocationTagNamespace ns;
+  private Set<String> tags;
+
+  public AllocationTags(AllocationTagNamespace namespace,
+      Set<String> allocationTags) {
+    this.ns = namespace;
+    this.tags = allocationTags;
+  }
+
+  /**
+   * @return the namespace of these tags.
+   */
+  public AllocationTagNamespace getNamespace() {
+    return this.ns;
+  }
+
+  /**
+   * @return the allocation tags.
+   */
+  public Set<String> getTags() {
+    return this.tags;
+  }
+}
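
A brief sketch of how this holder pairs a namespace with tags (the tag values and class name are illustrative):

import java.util.Arrays;
import java.util.HashSet;

import org.apache.hadoop.yarn.api.records.AllocationTagNamespace;
import org.apache.hadoop.yarn.api.records.AllocationTags;

public class AllocationTagsExample {
  public static void main(String[] args) {
    // Group the tags "hbase" and "spark" under the "all" namespace.
    AllocationTags tags = new AllocationTags(
        new AllocationTagNamespace.All(),
        new HashSet<>(Arrays.asList("hbase", "spark")));
    System.out.println(tags.getNamespace()); // prints "all"
    System.out.println(tags.getTags());      // e.g. [hbase, spark]
  }
}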

+ 38 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Evaluable.java

@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.api.records;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * A class that implements the Evaluable interface indicates that its
+ * internal state can be changed by evaluating it against a given target.
+ * @param <T> the type of target to evaluate against
+ */
+public interface Evaluable<T> {
+
+  /**
+   * Evaluates against a given target; this process changes the internal
+   * state of the current instance.
+   *
+   * @param target a generic type target that impacts this evaluation.
+   * @throws YarnException if the evaluation fails.
+   */
+  void evaluate(T target) throws YarnException;
+}
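
To illustrate the contract, a hypothetical Evaluable implementation (not part of the patch) that lazily binds its state from the target it is evaluated against:

import org.apache.hadoop.yarn.api.records.Evaluable;
import org.apache.hadoop.yarn.exceptions.YarnException;

// Hypothetical: caches the length of whatever target it is evaluated against.
public class TargetLength implements Evaluable<String> {

  private int length = -1;

  @Override
  public void evaluate(String target) throws YarnException {
    if (target == null) {
      throw new YarnException("Cannot evaluate against a null target");
    }
    this.length = target.length();
  }

  public int getLength() {
    if (length < 0) {
      throw new IllegalStateException("evaluate() has not been called yet");
    }
    return length;
  }
}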

+ 53 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/TargetApplications.java

@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.api.records;
+
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * This class is used by
+ * {@link AllocationTagNamespace#evaluate(TargetApplications)} to evaluate
+ * a namespace.
+ */
+public class TargetApplications {
+
+  private ApplicationId currentAppId;
+  private Set<ApplicationId> allAppIds;
+
+  public TargetApplications(ApplicationId currentApplicationId,
+      Set<ApplicationId> allApplicationIds) {
+    this.currentAppId = currentApplicationId;
+    this.allAppIds = allApplicationIds;
+  }
+
+  public Set<ApplicationId> getAllApplicationIds() {
+    return this.allAppIds;
+  }
+
+  public ApplicationId getCurrentApplicationId() {
+    return this.currentAppId;
+  }
+
+  public Set<ApplicationId> getOtherApplicationIds() {
+    return allAppIds == null ? null : allAppIds.stream().filter(appId ->
+        !appId.equals(getCurrentApplicationId()))
+        .collect(Collectors.toSet());
+  }
+}
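
A small sketch of the filtering semantics above (the application IDs and class name are illustrative):

import java.util.Arrays;
import java.util.HashSet;

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.TargetApplications;

public class TargetApplicationsExample {
  public static void main(String[] args) {
    ApplicationId app1 = ApplicationId.newInstance(1L, 1);
    ApplicationId app2 = ApplicationId.newInstance(1L, 2);
    TargetApplications target = new TargetApplications(app1,
        new HashSet<>(Arrays.asList(app1, app2)));

    // The current application is filtered out of the "other" set.
    System.out.println(target.getOtherApplicationIds()); // only app2

    // When the set of all applications is unknown, the method returns null.
    TargetApplications unknown = new TargetApplications(app1, null);
    System.out.println(unknown.getOtherApplicationIds()); // null
  }
}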

+ 17 - 9
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/resource/PlacementConstraints.java

@@ -20,9 +20,9 @@ package org.apache.hadoop.yarn.api.resource;
 
 import java.util.concurrent.TimeUnit;
 
-import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceAudience.Public;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
+import org.apache.hadoop.yarn.api.records.AllocationTagNamespace;
 import org.apache.hadoop.yarn.api.resource.PlacementConstraint.AbstractConstraint;
 import org.apache.hadoop.yarn.api.resource.PlacementConstraint.And;
 import org.apache.hadoop.yarn.api.resource.PlacementConstraint.DelayedOr;
@@ -50,13 +50,6 @@ public final class PlacementConstraints {
   public static final String RACK = PlacementConstraint.RACK_SCOPE;
   public static final String NODE_PARTITION = "yarn_node_partition/";
 
-  private static final String APPLICATION_LABEL_PREFIX =
-      "yarn_application_label/";
-
-  @InterfaceAudience.Private
-  public static final String APPLICATION_LABEL_INTRA_APPLICATION =
-      APPLICATION_LABEL_PREFIX + "%intra_app%";
-
   /**
    * Creates a constraint that requires allocations to be placed on nodes that
    * satisfy all target expressions within the given scope (e.g., node or rack).
@@ -223,6 +216,20 @@ public final class PlacementConstraints {
           allocationTags);
     }
 
+    /**
+     * Constructs a target expression on a set of allocation tags under
+     * a certain namespace.
+     *
+     * @param namespace namespace of the allocation tags
+     * @param allocationTags allocation tags
+     * @return a target expression
+     */
+    public static TargetExpression allocationTagWithNamespace(String namespace,
+        String... allocationTags) {
+      return new TargetExpression(TargetType.ALLOCATION_TAG,
+          namespace, allocationTags);
+    }
+
     /**
      * Constructs a target expression on an allocation tag. It is satisfied if
      * there are allocations with one of the given tags. Comparing to
@@ -235,8 +242,9 @@ public final class PlacementConstraints {
      */
     public static TargetExpression allocationTagToIntraApp(
         String... allocationTags) {
+      AllocationTagNamespace selfNs = new AllocationTagNamespace.Self();
       return new TargetExpression(TargetType.ALLOCATION_TAG,
-          APPLICATION_LABEL_INTRA_APPLICATION, allocationTags);
+          selfNs.toString(), allocationTags);
     }
   }
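
A sketch of how the new namespace-aware target might be combined into a constraint, assuming the existing targetNotIn builder, the NODE scope constant, and the PlacementTargets inner class are used as elsewhere in this API; the tag name is illustrative:

import static org.apache.hadoop.yarn.api.resource.PlacementConstraints.NODE;
import static org.apache.hadoop.yarn.api.resource.PlacementConstraints.targetNotIn;
import static org.apache.hadoop.yarn.api.resource.PlacementConstraints.PlacementTargets.allocationTagWithNamespace;

import org.apache.hadoop.yarn.api.resource.PlacementConstraint;

public class NamespaceConstraintExample {
  public static void main(String[] args) {
    // Anti-affinity against "hbase-rs" containers of other applications:
    // do not place on a node that already holds such an allocation.
    PlacementConstraint antiAffinity = targetNotIn(NODE,
        allocationTagWithNamespace("not-self", "hbase-rs"))
        .build();
    System.out.println(antiAffinity);
  }
}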
 

+ 34 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/exceptions/InvalidAllocationTagException.java

@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.exceptions;
+
+/**
+ * This exception is thrown by
+ * {@link
+ * org.apache.hadoop.yarn.api.records.AllocationTagNamespace#parse(String)}
+ * when it fails to parse a namespace.
+ */
+public class InvalidAllocationTagException extends YarnException {
+
+  private static final long serialVersionUID = 1L;
+
+  public InvalidAllocationTagException(String message) {
+    super(message);
+  }
+}

+ 2 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/constraint/PlacementConstraintParser.java

@@ -431,10 +431,10 @@ public final class PlacementConstraintParser {
       }
 
       String maxCardinalityStr = resetElements.pop();
-      Integer max = toInt(maxCardinalityStr);
+      int max = toInt(maxCardinalityStr);
 
       String minCardinalityStr = resetElements.pop();
-      Integer min = toInt(minCardinalityStr);
+      int min = toInt(minCardinalityStr);
 
       ArrayList<String> targetTags = new ArrayList<>();
       while (!resetElements.empty()) {

+ 10 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/pom.xml

@@ -249,6 +249,10 @@
             <exclude>src/test/resources/application_1440536969523_0001.har/part-0</exclude>
             <exclude>src/test/resources/application_1440536969523_0001.har/_masterindex</exclude>
             <exclude>src/test/resources/application_1440536969523_0001.har/_SUCCESS</exclude>
+            <exclude>src/test/resources/application_123456_0001.har/_index</exclude>
+            <exclude>src/test/resources/application_123456_0001.har/part-0</exclude>
+            <exclude>src/test/resources/application_123456_0001.har/_masterindex</exclude>
+            <exclude>src/test/resources/application_123456_0001.har/_SUCCESS</exclude>
           </excludes>
         </configuration>
       </plugin>
@@ -371,6 +375,12 @@
               </excludes>
             </configuration>
           </plugin>
+          <plugin>
+            <artifactId>maven-javadoc-plugin</artifactId>
+            <configuration>
+              <excludePackageNames>org.apache.hadoop.yarn.webapp.hamlet</excludePackageNames>
+            </configuration>
+          </plugin>
         </plugins>
       </build>
     </profile>

+ 41 - 19
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/LogAggregationIndexedFileController.java

@@ -495,16 +495,21 @@ public class LogAggregationIndexedFileController
     boolean getAllContainers = (containerIdStr == null
         || containerIdStr.isEmpty());
     long size = logRequest.getBytes();
-    List<FileStatus> nodeFiles = LogAggregationUtils
-        .getRemoteNodeFileList(conf, appId, logRequest.getAppOwner(),
+    RemoteIterator<FileStatus> nodeFiles = LogAggregationUtils
+        .getRemoteNodeFileDir(conf, appId, logRequest.getAppOwner(),
         this.remoteRootLogDir, this.remoteRootLogDirSuffix);
-    if (nodeFiles.isEmpty()) {
+    if (!nodeFiles.hasNext()) {
       throw new IOException("There is no available log fils for "
           + "application:" + appId);
     }
-    Map<String, Long> checkSumFiles = parseCheckSumFiles(nodeFiles);
+    List<FileStatus> allFiles = getAllNodeFiles(nodeFiles, appId);
+    if (allFiles.isEmpty()) {
+      throw new IOException("There is no available log fils for "
+          + "application:" + appId);
+    }
+    Map<String, Long> checkSumFiles = parseCheckSumFiles(allFiles);
     List<FileStatus> fileToRead = getNodeLogFileToRead(
-        nodeFiles, nodeIdStr, appId);
+        allFiles, nodeIdStr, appId);
     byte[] buf = new byte[65535];
     for (FileStatus thisNodeFile : fileToRead) {
       String nodeName = thisNodeFile.getPath().getName();
@@ -609,16 +614,21 @@ public class LogAggregationIndexedFileController
         containerIdStr.isEmpty());
     String nodeIdStr = (nodeId == null || nodeId.isEmpty()) ? null
         : LogAggregationUtils.getNodeString(nodeId);
-    List<FileStatus> nodeFiles = LogAggregationUtils
-        .getRemoteNodeFileList(conf, appId, appOwner, this.remoteRootLogDir,
+    RemoteIterator<FileStatus> nodeFiles = LogAggregationUtils
+        .getRemoteNodeFileDir(conf, appId, appOwner, this.remoteRootLogDir,
         this.remoteRootLogDirSuffix);
-    if (nodeFiles.isEmpty()) {
+    if (!nodeFiles.hasNext()) {
       throw new IOException("There is no available log fils for "
           + "application:" + appId);
     }
-    Map<String, Long> checkSumFiles = parseCheckSumFiles(nodeFiles);
+    List<FileStatus> allFiles = getAllNodeFiles(nodeFiles, appId);
+    if (allFiles.isEmpty()) {
+      throw new IOException("There is no available log fils for "
+          + "application:" + appId);
+    }
+    Map<String, Long> checkSumFiles = parseCheckSumFiles(allFiles);
     List<FileStatus> fileToRead = getNodeLogFileToRead(
-        nodeFiles, nodeIdStr, appId);
+        allFiles, nodeIdStr, appId);
     for(FileStatus thisNodeFile : fileToRead) {
       try {
         Long checkSumIndex = checkSumFiles.get(
@@ -727,21 +737,33 @@ public class LogAggregationIndexedFileController
       List<FileStatus> nodeFiles, String nodeId, ApplicationId appId)
       throws IOException {
     List<FileStatus> listOfFiles = new ArrayList<>();
-    List<FileStatus> files = new ArrayList<>(nodeFiles);
-    for (FileStatus file : files) {
-      String nodeName = file.getPath().getName();
+    for (FileStatus thisNodeFile : nodeFiles) {
+      String nodeName = thisNodeFile.getPath().getName();
       if ((nodeId == null || nodeId.isEmpty()
           || nodeName.contains(LogAggregationUtils
           .getNodeString(nodeId))) && !nodeName.endsWith(
               LogAggregationUtils.TMP_FILE_SUFFIX) &&
           !nodeName.endsWith(CHECK_SUM_FILE_SUFFIX)) {
-        if (nodeName.equals(appId + ".har")) {
-          Path p = new Path("har:///" + file.getPath().toUri().getRawPath());
-          files = Arrays.asList(HarFs.get(p.toUri(), conf).listStatus(p));
-          continue;
-        }
-        listOfFiles.add(file);
+        listOfFiles.add(thisNodeFile);
+      }
+    }
+    return listOfFiles;
+  }
+
+  private List<FileStatus> getAllNodeFiles(
+      RemoteIterator<FileStatus> nodeFiles, ApplicationId appId)
+      throws IOException {
+    List<FileStatus> listOfFiles = new ArrayList<>();
+    while (nodeFiles != null && nodeFiles.hasNext()) {
+      FileStatus thisNodeFile = nodeFiles.next();
+      String nodeName = thisNodeFile.getPath().getName();
+      if (nodeName.equals(appId + ".har")) {
+        Path p = new Path("har:///"
+            + thisNodeFile.getPath().toUri().getRawPath());
+        nodeFiles = HarFs.get(p.toUri(), conf).listStatusIterator(p);
+        continue;
       }
+      listOfFiles.add(thisNodeFile);
     }
     return listOfFiles;
   }
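
For readers unfamiliar with the pattern, a generic sketch (not the controller's exact code) of walking a RemoteIterator and descending into an aggregated-log .har archive via HarFs; the class and method names are illustrative:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.HarFs;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class HarListingSketch {
  // Collects the per-node log files, transparently expanding a .har archive.
  static List<FileStatus> listNodeFiles(RemoteIterator<FileStatus> it,
      String appId, Configuration conf) throws IOException {
    List<FileStatus> files = new ArrayList<>();
    while (it != null && it.hasNext()) {
      FileStatus status = it.next();
      if (status.getPath().getName().equals(appId + ".har")) {
        // Switch to iterating over the contents of the archive instead.
        Path har = new Path("har:///" + status.getPath().toUri().getRawPath());
        it = HarFs.get(har.toUri(), conf).listStatusIterator(har);
        continue;
      }
      files.add(status);
    }
    return files;
  }
}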

+ 21 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/Apps.java

@@ -23,6 +23,7 @@ import static org.apache.hadoop.yarn.util.StringHelper.join;
 import static org.apache.hadoop.yarn.util.StringHelper.sjoin;
 
 import java.io.File;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.regex.Matcher;
@@ -105,7 +106,26 @@ public class Apps {
       }
     }
   }
-  
+
+  /**
+   * Returns the names of the environment variables defined in the input string.
+   * @param envString String containing env variable definitions
+   * @param classPathSeparator String that separates the definitions
+   * @return ArrayList of environment variable names
+   */
+  public static ArrayList<String> getEnvVarsFromInputString(String envString,
+      String classPathSeparator) {
+    ArrayList<String> envList = new ArrayList<>();
+    if (envString != null && envString.length() > 0) {
+      Matcher varValMatcher = VARVAL_SPLITTER.matcher(envString);
+      while (varValMatcher.find()) {
+        String envVar = varValMatcher.group(1);
+        envList.add(envVar);
+      }
+    }
+    return envList;
+  }
+
   /**
    * This older version of this method is kept around for compatibility
    * because downstream frameworks like Spark and Tez have been using it.
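
A small sketch of the helper added above (the input string and class name are illustrative):

import java.util.ArrayList;

import org.apache.hadoop.yarn.util.Apps;

public class EnvVarNamesExample {
  public static void main(String[] args) {
    // Extracts only the variable names from a list of VAR=VALUE definitions.
    ArrayList<String> names = Apps.getEnvVarsFromInputString(
        "JAVA_HOME=/usr/java,HADOOP_HOME=/opt/hadoop", ",");
    System.out.println(names); // expected: [JAVA_HOME, HADOOP_HOME]
  }
}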

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AuxiliaryServiceHelper.java

@@ -45,7 +45,7 @@ public class AuxiliaryServiceHelper {
         Base64.encodeBase64String(byteData));
   }
 
-  private static String getPrefixServiceName(String serviceName) {
+  public static String getPrefixServiceName(String serviceName) {
     return NM_AUX_SERVICE + serviceName;
   }
 }

+ 54 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/TestLogAggregationIndexFileController.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.logaggregation.filecontroller.ifile;
 
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertNotNull;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import java.io.ByteArrayOutputStream;
@@ -27,6 +28,7 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Writer;
+import java.net.URL;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -364,6 +366,58 @@ public class TestLogAggregationIndexFileController {
     sysOutStream.reset();
   }
 
+  @Test(timeout = 15000)
+  public void testFetchApplictionLogsHar() throws Exception {
+    List<String> newLogTypes = new ArrayList<>();
+    newLogTypes.add("syslog");
+    newLogTypes.add("stdout");
+    newLogTypes.add("stderr");
+    newLogTypes.add("test1");
+    newLogTypes.add("test2");
+    URL harUrl = ClassLoader.getSystemClassLoader()
+        .getResource("application_123456_0001.har");
+    assertNotNull(harUrl);
+
+    Path path = new Path(remoteLogDir + "/" + USER_UGI.getShortUserName()
+        + "/logs/application_123456_0001");
+    if (fs.exists(path)) {
+      fs.delete(path, true);
+    }
+    assertTrue(fs.mkdirs(path));
+    Path harPath = new Path(path, "application_123456_0001.har");
+    fs.copyFromLocalFile(false, new Path(harUrl.toURI()), harPath);
+    assertTrue(fs.exists(harPath));
+    LogAggregationIndexedFileController fileFormat
+        = new LogAggregationIndexedFileController();
+    fileFormat.initialize(conf, "Indexed");
+    ContainerLogsRequest logRequest = new ContainerLogsRequest();
+    logRequest.setAppId(appId);
+    logRequest.setNodeId(nodeId.toString());
+    logRequest.setAppOwner(USER_UGI.getShortUserName());
+    logRequest.setContainerId(containerId.toString());
+    logRequest.setBytes(Long.MAX_VALUE);
+    List<ContainerLogMeta> meta = fileFormat.readAggregatedLogsMeta(
+        logRequest);
+    Assert.assertEquals(meta.size(), 3);
+    List<String> fileNames = new ArrayList<>();
+    for (ContainerLogMeta log : meta) {
+      Assert.assertTrue(log.getContainerId().equals(containerId.toString()));
+      Assert.assertTrue(log.getNodeId().equals(nodeId.toString()));
+      for (ContainerLogFileInfo file : log.getContainerLogMeta()) {
+        fileNames.add(file.getFileName());
+      }
+    }
+    fileNames.removeAll(newLogTypes);
+    Assert.assertTrue(fileNames.isEmpty());
+    boolean foundLogs = fileFormat.readAggregatedLogs(logRequest, System.out);
+    Assert.assertTrue(foundLogs);
+    for (String logType : newLogTypes) {
+      Assert.assertTrue(sysOutStream.toString().contains(logMessage(
+          containerId, logType)));
+    }
+    sysOutStream.reset();
+  }
+
   private File createAndWriteLocalLogFile(ContainerId containerId,
       Path localLogDir, String logType) throws IOException {
     File file = new File(localLogDir.toString(), logType);

+ 0 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/_SUCCESS


+ 3 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/_index

@@ -0,0 +1,3 @@
+%2F dir 1517728311922+493+xuan+supergroup 0 0 localhost_9999_1517727665265 localhost_9999_1517727668513 
+%2Flocalhost_9999_1517727665265 file part-0 0 2895 1517728301581+420+xuan+supergroup 
+%2Flocalhost_9999_1517727668513 file part-0 2895 1228 1517728311919+420+xuan+supergroup 

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/_masterindex

@@ -0,0 +1,2 @@
+3 
+0 1897968749 0 280 

BIN
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/resources/application_123456_0001.har/part-0


+ 5 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java

@@ -112,4 +112,9 @@ public abstract class RegisterNodeManagerRequest {
    * @param physicalResource Physical resources in the node.
    */
   public abstract void setPhysicalResource(Resource physicalResource);
+
+  public abstract List<LogAggregationReport> getLogAggregationReportsForApps();
+
+  public abstract void setLogAggregationReportsForApps(
+      List<LogAggregationReport> logAggregationReportsForApps);
 }

+ 79 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java

@@ -38,11 +38,13 @@ import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationIdProto;
 import org.apache.hadoop.yarn.proto.YarnProtos.NodeIdProto;
 import org.apache.hadoop.yarn.proto.YarnProtos.NodeLabelProto;
 import org.apache.hadoop.yarn.proto.YarnProtos.ResourceProto;
+import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.LogAggregationReportProto;
 import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NMContainerStatusProto;
 import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeLabelsProto;
 import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeLabelsProto.Builder;
 import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.RegisterNodeManagerRequestProto;
 import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.RegisterNodeManagerRequestProtoOrBuilder;
+import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
     
@@ -57,6 +59,8 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest
   private List<ApplicationId> runningApplications = null;
   private Set<NodeLabel> labels = null;
 
+  private List<LogAggregationReport> logAggregationReportsForApps = null;
+
   /** Physical resources in the node. */
   private Resource physicalResource = null;
 
@@ -100,6 +104,48 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest
     if (this.physicalResource != null) {
       builder.setPhysicalResource(convertToProtoFormat(this.physicalResource));
     }
+    if (this.logAggregationReportsForApps != null) {
+      addLogAggregationStatusForAppsToProto();
+    }
+  }
+
+  private void addLogAggregationStatusForAppsToProto() {
+    maybeInitBuilder();
+    builder.clearLogAggregationReportsForApps();
+    if (this.logAggregationReportsForApps == null) {
+      return;
+    }
+    Iterable<LogAggregationReportProto> it =
+        new Iterable<LogAggregationReportProto>() {
+          @Override
+          public Iterator<LogAggregationReportProto> iterator() {
+            return new Iterator<LogAggregationReportProto>() {
+              private Iterator<LogAggregationReport> iter =
+                  logAggregationReportsForApps.iterator();
+
+              @Override
+              public boolean hasNext() {
+                return iter.hasNext();
+              }
+
+              @Override
+              public LogAggregationReportProto next() {
+                return convertToProtoFormat(iter.next());
+              }
+
+              @Override
+              public void remove() {
+                throw new UnsupportedOperationException();
+              }
+            };
+          }
+        };
+    builder.addAllLogAggregationReportsForApps(it);
+  }
+
+  private LogAggregationReportProto convertToProtoFormat(
+      LogAggregationReport value) {
+    return ((LogAggregationReportPBImpl) value).getProto();
   }
 
   private synchronized void addNMContainerStatusesToProto() {
@@ -400,4 +446,37 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest
       NMContainerStatus c) {
     return ((NMContainerStatusPBImpl)c).getProto();
   }
+
+  @Override
+  public List<LogAggregationReport> getLogAggregationReportsForApps() {
+    if (this.logAggregationReportsForApps != null) {
+      return this.logAggregationReportsForApps;
+    }
+    initLogAggregationReportsForApps();
+    return logAggregationReportsForApps;
+  }
+
+  private void initLogAggregationReportsForApps() {
+    RegisterNodeManagerRequestProtoOrBuilder p = viaProto ? proto : builder;
+    List<LogAggregationReportProto> list =
+        p.getLogAggregationReportsForAppsList();
+    this.logAggregationReportsForApps = new ArrayList<LogAggregationReport>();
+    for (LogAggregationReportProto c : list) {
+      this.logAggregationReportsForApps.add(convertFromProtoFormat(c));
+    }
+  }
+
+  private LogAggregationReport convertFromProtoFormat(
+      LogAggregationReportProto logAggregationReport) {
+    return new LogAggregationReportPBImpl(logAggregationReport);
+  }
+
+  @Override
+  public void setLogAggregationReportsForApps(
+      List<LogAggregationReport> logAggregationStatusForApps) {
+    if (logAggregationStatusForApps == null) {
+      builder.clearLogAggregationReportsForApps();
+    }
+    this.logAggregationReportsForApps = logAggregationStatusForApps;
+  }
 }

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto

@@ -66,6 +66,7 @@ message RegisterNodeManagerRequestProto {
   repeated ApplicationIdProto runningApplications = 7;
   optional NodeLabelsProto nodeLabels = 8;
   optional ResourceProto physicalResource = 9;
+  repeated LogAggregationReportProto log_aggregation_reports_for_apps = 10;
 }
 
 message RegisterNodeManagerResponseProto {

+ 41 - 23
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

@@ -27,6 +27,7 @@ import java.net.UnknownHostException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.LinkedHashSet;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
@@ -316,14 +317,15 @@ public abstract class ContainerExecutor implements Configurable {
    * @param command the command that will be run
    * @param logDir the log dir to which to copy debugging information
    * @param user the username of the job owner
+   * @param nmVars the set of environment vars that are explicitly set by NM
    * @throws IOException if any errors happened writing to the OutputStream,
    * while creating symlinks
    */
   public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
       Map<Path, List<String>> resources, List<String> command, Path logDir,
-      String user) throws IOException {
+      String user, LinkedHashSet<String> nmVars) throws IOException {
     this.writeLaunchEnv(out, environment, resources, command, logDir, user,
-        ContainerLaunch.CONTAINER_SCRIPT);
+        ContainerLaunch.CONTAINER_SCRIPT, nmVars);
   }
 
   /**
@@ -339,14 +341,15 @@ public abstract class ContainerExecutor implements Configurable {
    * @param logDir the log dir to which to copy debugging information
    * @param user the username of the job owner
    * @param outFilename the path to which to write the launch environment
+   * @param nmVars the set of environment vars that are explicitly set by NM
    * @throws IOException if any errors happened writing to the OutputStream,
    * while creating symlinks
    */
   @VisibleForTesting
   public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
       Map<Path, List<String>> resources, List<String> command, Path logDir,
-      String user, String outFilename) throws IOException {
-    updateEnvForWhitelistVars(environment);
+      String user, String outFilename, LinkedHashSet<String> nmVars)
+      throws IOException {
 
     ContainerLaunch.ShellScriptBuilder sb =
         ContainerLaunch.ShellScriptBuilder.create();
@@ -361,9 +364,41 @@ public abstract class ContainerExecutor implements Configurable {
 
     if (environment != null) {
       sb.echo("Setting up env variables");
+      // Whitelist environment variables are treated specially.
+      // Only add them if they are not already defined in the environment.
+      // Add them using special syntax to prevent them from eclipsing
+      // variables that may be set explicitly in the container image (e.g.,
+      // in a docker image).  Put these before the others to ensure the
+      // correct expansion is used.
+      for(String var : whitelistVars) {
+        if (!environment.containsKey(var)) {
+          String val = getNMEnvVar(var);
+          if (val != null) {
+            sb.whitelistedEnv(var, val);
+          }
+        }
+      }
+      // Now write vars that were set explicitly by nodemanager, preserving
+      // the order they were written in.
+      for (String nmEnvVar : nmVars) {
+        sb.env(nmEnvVar, environment.get(nmEnvVar));
+      }
+      // Now write the remaining environment variables.
       for (Map.Entry<String, String> env :
-          sb.orderEnvByDependencies(environment).entrySet()) {
-        sb.env(env.getKey(), env.getValue());
+           sb.orderEnvByDependencies(environment).entrySet()) {
+        if (!nmVars.contains(env.getKey())) {
+          sb.env(env.getKey(), env.getValue());
+        }
+      }
+      // Add the whitelist vars to the environment.  Do this after writing
+      // environment variables so they are not written twice.
+      for(String var : whitelistVars) {
+        if (!environment.containsKey(var)) {
+          String val = getNMEnvVar(var);
+          if (val != null) {
+            environment.put(var, val);
+          }
+        }
       }
     }
 
@@ -664,23 +699,6 @@ public abstract class ContainerExecutor implements Configurable {
     }
   }
 
-  /**
-   * Propagate variables from the nodemanager's environment into the
-   * container's environment if unspecified by the container.
-   * @param env the environment to update
-   * @see org.apache.hadoop.yarn.conf.YarnConfiguration#NM_ENV_WHITELIST
-   */
-  protected void updateEnvForWhitelistVars(Map<String, String> env) {
-    for(String var : whitelistVars) {
-      if (!env.containsKey(var)) {
-        String val = getNMEnvVar(var);
-        if (val != null) {
-          env.put(var, val);
-        }
-      }
-    }
-  }
-
   @VisibleForTesting
   protected String getNMEnvVar(String varname) {
     return System.getenv(varname);
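
To make the ordering concrete, a standalone sketch (not the executor's real code) of the three passes described in the comments above; it assumes, as the patch's comments suggest, that whitelisted variables are written with the ${VAR:-default} shell form so values baked into a container image are not overridden:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;

public class LaunchEnvOrderingSketch {
  public static void main(String[] args) {
    Map<String, String> env = new LinkedHashMap<>();
    env.put("HADOOP_CONF_DIR", "/etc/hadoop/conf");   // set by the NM
    env.put("MY_APP_OPT", "foo");                     // set by the user
    LinkedHashSet<String> nmVars =
        new LinkedHashSet<>(Arrays.asList("HADOOP_CONF_DIR"));
    List<String> whitelist = Arrays.asList("JAVA_HOME", "HADOOP_CONF_DIR");

    List<String> scriptLines = new ArrayList<>();
    // 1. Whitelisted vars not set by the app: keep any value already present
    //    in the container image by using the ${VAR:-default} form.
    for (String var : whitelist) {
      if (!env.containsKey(var) && System.getenv(var) != null) {
        scriptLines.add("export " + var + "=${" + var + ":-\""
            + System.getenv(var) + "\"}");
      }
    }
    // 2. Vars set explicitly by the nodemanager, in insertion order.
    for (String var : nmVars) {
      scriptLines.add("export " + var + "=\"" + env.get(var) + "\"");
    }
    // 3. Everything else from the application's environment.
    for (Map.Entry<String, String> e : env.entrySet()) {
      if (!nmVars.contains(e.getKey())) {
        scriptLines.add("export " + e.getKey() + "=\"" + e.getValue() + "\"");
      }
    }
    scriptLines.forEach(System.out::println);
  }
}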

+ 3 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java

@@ -33,7 +33,7 @@ import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
-
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.tracker.NMLogAggregationStatusTracker;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
 import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@@ -121,6 +121,8 @@ public interface Context {
 
   NMTimelinePublisher getNMTimelinePublisher();
 
+  NMLogAggregationStatusTracker getNMLogAggregationStatusTracker();
+
   ContainerExecutor getContainerExecutor();
 
   ContainerStateTransitionListener getContainerStateTransitionListener();

Some files were not shown because too many files changed in this diff