
HADOOP-19057. S3A: Landsat bucket used in tests no longer accessible (#6515)

The AWS Landsat data previously used in some S3A tests is no
longer accessible.

This PR moves to the new external file
s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz

* Large enough file for scale tests
* Bucket supports anonymous access (see the check below)
* Ends in .gz to keep codec tests happy
* No spaces in path to keep bucket-info happy
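
Since the bucket supports anonymous access, reachability can be checked
without credentials. A quick sketch, adapted from the `index.md` example
further down this page (it assumes the `hadoop` CLI is on the path and that
the bucket still permits anonymous reads):

```bash
hadoop fs \
  -D fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
  -ls s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz
```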

Test Code Changes
* Leaves the test key name alone: fs.s3a.scale.test.csvfile
* Renames all methods and fields to remove "csv" from their names and
  use "external file" instead; we no longer require it to be CSV.
* Path definition and helper methods have been moved to
  PublicDatasetTestUtils (usage sketch below)
* Improve error reporting in ITestS3AInputStreamPerformance if the file
  is too short
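
A minimal sketch of the new calling pattern (the wrapper class here is
hypothetical; the helper names and return types are those visible in the
diffs below):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireDefaultExternalData;

/** Sketch only; not part of this commit. */
public class ExternalFileUsageSketch {

  public void resolveExternalFile(Configuration conf) throws Exception {
    // Resolves fs.s3a.scale.test.csvfile, falling back to the default
    // external file when the option is unset.
    Path external = getExternalData(conf);

    // Stricter variant for tests which must run against the default,
    // read-only dataset (e.g. the delete-failure tests below).
    Path readOnly = requireDefaultExternalData(conf);

    // The external file lives in a different bucket from the test FS,
    // so a filesystem instance is created for its URI explicitly.
    try (FileSystem fs = FileSystem.newInstance(external.toUri(), conf)) {
      fs.getFileStatus(external);
    }
  }
}
```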

With S3 Select removed, there is no need for the file to be
a CSV file; however, one test still tries to unzip it, and
other tests require a minimum file size.

Consult the JIRA for the settings to add to auth-keys.xml
to switch earlier builds to this same file.
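
The JIRA remains the canonical reference; based on the `testing.md` changes
in this commit, those settings are likely along these lines:

```xml
<!-- Sketch only: consult HADOOP-19057 for the authoritative values. -->
<property>
  <name>fs.s3a.scale.test.csvfile</name>
  <value>s3a://noaa-cors-pds/raw/2023/001/akse/AKSE001a.23_.gz</value>
</property>

<property>
  <name>fs.s3a.bucket.noaa-cors-pds.endpoint.region</name>
  <value>us-east-1</value>
</property>
```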

Contributed by Steve Loughran
Steve Loughran 1 year ago
parent
commit
f1927ede7c
30 files changed, 362 additions and 298 deletions
  1. +1 -1 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md
  2. +4 -4 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/auditing.md
  3. +3 -30 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md
  4. +4 -5 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md
  5. +1 -1 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md
  6. +7 -7 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md
  7. +14 -16 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/directory_markers.md
  8. +7 -8 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/encryption.md
  9. +3 -3 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
  10. +4 -3 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md
  11. +11 -13 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
  12. +2 -2 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java
  13. +14 -23 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java
  14. +47 -38 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java
  15. +6 -4 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java
  16. +14 -11 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
  17. +9 -12 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java
  18. +3 -3 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/adapter/TestV1CredentialsProvider.java
  19. +2 -2 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java
  20. +21 -10 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java
  21. +2 -3 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFilesystem.java
  22. +9 -8 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFilesystem.java
  23. +8 -8 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java
  24. +2 -1 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestPaths.java
  25. +20 -22 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java
  26. +3 -2 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestAuthoritativePath.java
  27. +15 -3 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java
  28. +14 -45 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java
  29. +82 -0 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java
  30. +30 -10 hadoop-tools/hadoop-aws/src/test/resources/core-site.xml

+ 1 - 1
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md

@@ -585,7 +585,7 @@ If an operation fails with an `AccessDeniedException`, then the role does not ha
 the permission for the S3 Operation invoked during the call.
 
 ```
-> hadoop fs -touch  s3a://landsat-pds/a
+> hadoop fs -touch  s3a://noaa-isd-pds/a
 
 java.nio.file.AccessDeniedException: a: Writing Object on a:
  software.amazon.awssdk.services.s3.model.S3Exception: Access Denied

+ 4 - 4
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/auditing.md

@@ -111,9 +111,9 @@ Specific buckets can have auditing disabled, even when it is enabled globally.
 
 ```xml
 <property>
-  <name>fs.s3a.bucket.landsat-pds.audit.enabled</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.audit.enabled</name>
   <value>false</value>
-  <description>Do not audit landsat bucket operations</description>
+  <description>Do not audit bucket operations</description>
 </property>
 ```
 
@@ -342,9 +342,9 @@ either globally or for specific buckets:
 </property>
 
 <property>
-  <name>fs.s3a.bucket.landsat-pds.audit.referrer.enabled</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.audit.referrer.enabled</name>
   <value>false</value>
-  <description>Do not add the referrer header to landsat operations</description>
+  <description>Do not add the referrer header to operations</description>
 </property>
 ```
 

+ 3 - 30
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md

@@ -747,7 +747,7 @@ For example, for any job executed through Hadoop MapReduce, the Job ID can be us
 ### `Filesystem does not have support for 'magic' committer`
 
 ```
-org.apache.hadoop.fs.s3a.commit.PathCommitException: `s3a://landsat-pds': Filesystem does not have support for 'magic' committer enabled
+org.apache.hadoop.fs.s3a.commit.PathCommitException: `s3a://noaa-isd-pds': Filesystem does not have support for 'magic' committer enabled
 in configuration option fs.s3a.committer.magic.enabled
 ```
 
@@ -760,42 +760,15 @@ Remove all global/per-bucket declarations of `fs.s3a.bucket.magic.enabled` or se
 
 ```xml
 <property>
-  <name>fs.s3a.bucket.landsat-pds.committer.magic.enabled</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.committer.magic.enabled</name>
   <value>true</value>
 </property>
 ```
 
 Tip: you can verify that a bucket supports the magic committer through the
-`hadoop s3guard bucket-info` command:
+`hadoop s3guard bucket-info` command.
 
 
-```
-> hadoop s3guard bucket-info -magic s3a://landsat-pds/
-Location: us-west-2
-
-S3A Client
-        Signing Algorithm: fs.s3a.signing-algorithm=(unset)
-        Endpoint: fs.s3a.endpoint=s3.amazonaws.com
-        Encryption: fs.s3a.encryption.algorithm=none
-        Input seek policy: fs.s3a.experimental.input.fadvise=normal
-        Change Detection Source: fs.s3a.change.detection.source=etag
-        Change Detection Mode: fs.s3a.change.detection.mode=server
-
-S3A Committers
-        The "magic" committer is supported in the filesystem
-        S3A Committer factory class: mapreduce.outputcommitter.factory.scheme.s3a=org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory
-        S3A Committer name: fs.s3a.committer.name=magic
-        Store magic committer integration: fs.s3a.committer.magic.enabled=true
-
-Security
-        Delegation token support is disabled
-
-Directory Markers
-        The directory marker policy is "keep"
-        Available Policies: delete, keep, authoritative
-        Authoritative paths: fs.s3a.authoritative.path=```
-```
-
 ### Error message: "File being created has a magic path, but the filesystem has magic file support disabled"
 
 A file is being written to a path which is used for "magic" files,

+ 4 - 5
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md

@@ -284,14 +284,13 @@ a bucket.
 The up to date list of regions is [Available online](https://docs.aws.amazon.com/general/latest/gr/s3.html).
 
 This list can be used to specify the endpoint of individual buckets, for example
-for buckets in the central and EU/Ireland endpoints.
+for buckets in the us-west-2 and EU/Ireland endpoints.
 
 
 ```xml
 <property>
-  <name>fs.s3a.bucket.landsat-pds.endpoint.region</name>
+  <name>fs.s3a.bucket.us-west-2-dataset.endpoint.region</name>
   <value>us-west-2</value>
-  <description>The region for s3a://landsat-pds URLs</description>
 </property>
 
 <property>
@@ -354,9 +353,9 @@ The boolean option `fs.s3a.endpoint.fips` (default `false`) switches the S3A con
 For a single bucket:
 ```xml
 <property>
-  <name>fs.s3a.bucket.landsat-pds.endpoint.fips</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.endpoint.fips</name>
   <value>true</value>
-  <description>Use the FIPS endpoint for the landsat dataset</description>
+  <description>Use the FIPS endpoint for the NOAA dataset</description>
 </property>
 ```
 

+ 1 - 1
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md

@@ -188,7 +188,7 @@ If it was deployed unbonded, the DT Binding is asked to create a new DT.
 
 It is up to the binding what it includes in the token identifier, and how it obtains them.
 This new token identifier is included in a token which has a "canonical service name" of
-the URI of the filesystem (e.g "s3a://landsat-pds").
+the URI of the filesystem (e.g "s3a://noaa-isd-pds").
 
 The issued/reissued token identifier can be marshalled and reused.
 

+ 7 - 7
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md

@@ -481,8 +481,8 @@ This will fetch the token and save it to the named file (here, `tokens.bin`),
 even if Kerberos is disabled.
 
 ```bash
-# Fetch a token for the AWS landsat-pds bucket and save it to tokens.bin
-$ hdfs fetchdt --webservice s3a://landsat-pds/  tokens.bin
+# Fetch a token for the AWS noaa-isd-pds bucket and save it to tokens.bin
+$ hdfs fetchdt --webservice s3a://noaa-isd-pds/ tokens.bin
 ```
 
 If the command fails with `ERROR: Failed to fetch token` it means the
@@ -498,11 +498,11 @@ host on which it was created.
 ```bash
 $ bin/hdfs fetchdt --print tokens.bin
 
-Token (S3ATokenIdentifier{S3ADelegationToken/Session; uri=s3a://landsat-pds;
+Token (S3ATokenIdentifier{S3ADelegationToken/Session; uri=s3a://noaa-isd-pds;
 timestamp=1541683947569; encryption=EncryptionSecrets{encryptionMethod=SSE_S3};
 Created on vm1.local/192.168.99.1 at time 2018-11-08T13:32:26.381Z.};
 Session credentials for user AAABWL expires Thu Nov 08 14:02:27 GMT 2018; (valid))
-for s3a://landsat-pds
+for s3a://noaa-isd-pds
 ```
 The "(valid)" annotation means that the AWS credentials are considered "valid":
 there is both a username and a secret.
@@ -513,11 +513,11 @@ If delegation support is enabled, it also prints the current
 hadoop security level.
 
 ```bash
-$ hadoop s3guard bucket-info s3a://landsat-pds/
+$ hadoop s3guard bucket-info s3a://noaa-isd-pds/
 
-Filesystem s3a://landsat-pds
+Filesystem s3a://noaa-isd-pds
 Location: us-west-2
-Filesystem s3a://landsat-pds is not using S3Guard
+Filesystem s3a://noaa-isd-pds is not using S3Guard
 The "magic" committer is not supported
 
 S3A Client

+ 14 - 16
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/directory_markers.md

@@ -314,9 +314,8 @@ All releases of Hadoop which have been updated to be marker aware will support t
 Example: `s3guard bucket-info -markers aware` on a compatible release.
 
 ```
-> hadoop s3guard bucket-info -markers aware s3a://landsat-pds/
-Filesystem s3a://landsat-pds
-Location: us-west-2
+> hadoop s3guard bucket-info -markers aware s3a://noaa-isd-pds/
+Filesystem s3a://noaa-isd-pds
 
 ...
 
@@ -326,13 +325,14 @@ Directory Markers
         Authoritative paths: fs.s3a.authoritative.path=
         The S3A connector is compatible with buckets where directory markers are not deleted
 
+...
 ```
 
 The same command will fail on older releases, because the `-markers` option
 is unknown
 
 ```
-> hadoop s3guard bucket-info -markers aware s3a://landsat-pds/
+> hadoop s3guard bucket-info -markers aware s3a://noaa-isd-pds/
 Illegal option -markers
 Usage: hadoop bucket-info [OPTIONS] s3a://BUCKET
     provide/check information about a specific bucket
@@ -354,9 +354,8 @@ Generic options supported are:
 A specific policy check verifies that the connector is configured as desired
 
 ```
-> hadoop s3guard bucket-info -markers keep s3a://landsat-pds/
-Filesystem s3a://landsat-pds
-Location: us-west-2
+> hadoop s3guard bucket-info -markers keep s3a://noaa-isd-pds/
+Filesystem s3a://noaa-isd-pds
 
 ...
 
@@ -371,9 +370,8 @@ When probing for a specific policy, the error code "46" is returned if the activ
 does not match that requested:
 
 ```
-> hadoop s3guard bucket-info -markers delete s3a://landsat-pds/
-Filesystem s3a://landsat-pds
-Location: us-west-2
+> hadoop s3guard bucket-info -markers delete s3a://noaa-isd-pds/
+Filesystem s3a://noaa-isd-pds
 
 S3A Client
         Signing Algorithm: fs.s3a.signing-algorithm=(unset)
@@ -398,7 +396,7 @@ Directory Markers
         Authoritative paths: fs.s3a.authoritative.path=
 
 2021-11-22 16:03:59,175 [main] INFO  util.ExitUtil (ExitUtil.java:terminate(210))
- -Exiting with status 46: 46: Bucket s3a://landsat-pds: required marker polic is
+ -Exiting with status 46: 46: Bucket s3a://noaa-isd-pds: required marker policy is
   "keep" but actual policy is "delete"
 
 ```
@@ -450,10 +448,10 @@ Audit the path and fail if any markers were found.
 
 
 ```
-> hadoop s3guard markers -limit 8000 -audit s3a://landsat-pds/
+> hadoop s3guard markers -limit 8000 -audit s3a://noaa-isd-pds/
 
-The directory marker policy of s3a://landsat-pds is "Keep"
-2020-08-05 13:42:56,079 [main] INFO  tools.MarkerTool (DurationInfo.java:<init>(77)) - Starting: marker scan s3a://landsat-pds/
+The directory marker policy of s3a://noaa-isd-pds is "Keep"
+2020-08-05 13:42:56,079 [main] INFO  tools.MarkerTool (DurationInfo.java:<init>(77)) - Starting: marker scan s3a://noaa-isd-pds/
 Scanned 1,000 objects
 Scanned 2,000 objects
 Scanned 3,000 objects
@@ -463,8 +461,8 @@ Scanned 6,000 objects
 Scanned 7,000 objects
 Scanned 8,000 objects
 Limit of scan reached - 8,000 objects
-2020-08-05 13:43:01,184 [main] INFO  tools.MarkerTool (DurationInfo.java:close(98)) - marker scan s3a://landsat-pds/: duration 0:05.107s
-No surplus directory markers were found under s3a://landsat-pds/
+2020-08-05 13:43:01,184 [main] INFO  tools.MarkerTool (DurationInfo.java:close(98)) - marker scan s3a://noaa-isd-pds/: duration 0:05.107s
+No surplus directory markers were found under s3a://noaa-isd-pds/
 Listing limit reached before completing the scan
 2020-08-05 13:43:01,187 [main] INFO  util.ExitUtil (ExitUtil.java:terminate(210)) - Exiting with status 3:
 ```

+ 7 - 8
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/encryption.md

@@ -616,15 +616,14 @@ header.x-amz-version-id="KcDOVmznIagWx3gP1HlDqcZvm1mFWZ2a"
 A file with no-encryption (on a bucket without versioning but with intelligent tiering):
 
 ```
-bin/hadoop fs -getfattr -d s3a://landsat-pds/scene_list.gz
+ bin/hadoop fs -getfattr -d s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz
 
-# file: s3a://landsat-pds/scene_list.gz
-header.Content-Length="45603307"
-header.Content-Type="application/octet-stream"
-header.ETag="39c34d489777a595b36d0af5726007db"
-header.Last-Modified="Wed Aug 29 01:45:15 BST 2018"
-header.x-amz-storage-class="INTELLIGENT_TIERING"
-header.x-amz-version-id="null"
+# file: s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz
+header.Content-Length="524671"
+header.Content-Type="binary/octet-stream"
+header.ETag=""3e39531220fbd3747d32cf93a79a7a0c""
+header.Last-Modified="Tue Jan 02 00:15:13 GMT 2024"
+header.x-amz-server-side-encryption="AES256"
 ```
 
 ###<a name="changing-encryption"></a> Use `rename()` to encrypt files with new keys

+ 3 - 3
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md

@@ -503,7 +503,7 @@ explicitly opened up for broader access.
 ```bash
 hadoop fs -ls \
  -D fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
- s3a://landsat-pds/
+ s3a://noaa-isd-pds/
 ```
 
 1. Allowing anonymous access to an S3 bucket compromises
@@ -1630,11 +1630,11 @@ a session key:
 </property>
 ```
 
-Finally, the public `s3a://landsat-pds/` bucket can be accessed anonymously:
+Finally, the public `s3a://noaa-isd-pds/` bucket can be accessed anonymously:
 
 ```xml
 <property>
-  <name>fs.s3a.bucket.landsat-pds.aws.credentials.provider</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.aws.credentials.provider</name>
   <value>org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider</value>
 </property>
 ```

+ 4 - 3
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md

@@ -447,7 +447,8 @@ An example of this is covered in [HADOOP-13871](https://issues.apache.org/jira/b
 
 1. For public data, use `curl`:
 
-        curl -O https://landsat-pds.s3.amazonaws.com/scene_list.gz
+        curl -O https://noaa-cors-pds.s3.amazonaws.com/raw/2023/001/akse/AKSE001a.23_.gz
+
 1. Use `nettop` to monitor a process's connections.
 
 
@@ -696,7 +697,7 @@ via `FileSystem.get()` or `Path.getFileSystem()`.
 The cache, `FileSystem.CACHE`, will, for each user, cache one instance of a filesystem
 for a given URI.
 All calls to `FileSystem.get` for a cached FS for a URI such
-as `s3a://landsat-pds/` will return that singe single instance.
+as `s3a://noaa-isd-pds/` will return that single instance.
 
 FileSystem instances are created on-demand for the cache,
 and will be done in each thread which requests an instance.
@@ -720,7 +721,7 @@ can be created simultaneously for different object stores/distributed
 filesystems.
 
 For example, a value of four would put an upper limit on the number
-of wasted instantiations of a connector for the `s3a://landsat-pds/`
+of wasted instantiations of a connector for the `s3a://noaa-isd-pds/`
 bucket.
 
 ```xml

+ 11 - 13
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md

@@ -260,22 +260,20 @@ define the target region in `auth-keys.xml`.
 ### <a name="csv"></a> CSV Data Tests
 
 The `TestS3AInputStreamPerformance` tests require read access to a multi-MB
-text file. The default file for these tests is one published by amazon,
-[s3a://landsat-pds.s3.amazonaws.com/scene_list.gz](http://landsat-pds.s3.amazonaws.com/scene_list.gz).
-This is a gzipped CSV index of other files which amazon serves for open use.
+text file. The default file for these tests is a public one:
+`s3a://noaa-cors-pds/raw/2023/001/akse/AKSE001a.23_.gz`
+from the [NOAA Continuously Operating Reference Stations (CORS) Network (NCN)](https://registry.opendata.aws/noaa-ncn/).
 
 Historically it was required to be a `csv.gz` file to validate S3 Select
 support. Now that S3 Select support has been removed, other large files
 may be used instead.
-However, future versions may want to read a CSV file again, so testers
-should still reference one.
 
 The path to this object is set in the option `fs.s3a.scale.test.csvfile`,
 
 ```xml
 <property>
   <name>fs.s3a.scale.test.csvfile</name>
-  <value>s3a://landsat-pds/scene_list.gz</value>
+  <value>s3a://noaa-cors-pds/raw/2023/001/akse/AKSE001a.23_.gz</value>
 </property>
 ```
 
@@ -285,6 +283,7 @@ is hosted in Amazon's US-east datacenter.
 1. If the data cannot be read for any reason then the test will fail.
 1. If the property is set to a different path, then that data must be readable
 and "sufficiently" large.
+1. If a `.gz` file, expect decompression-related test failures.
 
 (the reason the space or newline is needed is to add "an empty entry"; an empty
 `<value/>` would be considered undefined and pick up the default)
@@ -292,14 +291,13 @@ and "sufficiently" large.
 
 If using a test file in a different AWS S3 region then
 a bucket-specific region must be defined.
-For the default test dataset, hosted in the `landsat-pds` bucket, this is:
+For the default test dataset, hosted in the `noaa-cors-pds` bucket, this is:
 
 ```xml
-<property>
-  <name>fs.s3a.bucket.landsat-pds.endpoint.region</name>
-  <value>us-west-2</value>
-  <description>The region for s3a://landsat-pds</description>
-</property>
+  <property>
+    <name>fs.s3a.bucket.noaa-cors-pds.endpoint.region</name>
+    <value>us-east-1</value>
+  </property>
 ```
 
 ### <a name="access"></a> Testing Access Point Integration
@@ -857,7 +855,7 @@ the tests become skipped, rather than fail with a trace which is really a false
 The ordered test case mechanism of `AbstractSTestS3AHugeFiles` is probably
 the most elegant way of chaining test setup/teardown.
 
-Regarding reusing existing data, we tend to use the landsat archive of
+Regarding reusing existing data, we tend to use the noaa-cors-pds archive of
 AWS US-East for our testing of input stream operations. This doesn't work
 against other regions, or with third party S3 implementations. Thus the
 URL can be overridden for testing elsewhere.

+ 2 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java

@@ -40,10 +40,10 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static org.apache.hadoop.fs.s3a.Constants.*;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getCSVTestPath;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING;
 import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.CONSTRUCTOR_EXCEPTION;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.junit.Assert.*;
 
@@ -207,7 +207,7 @@ public class ITestS3AAWSCredentialsProvider {
   @Test
   public void testAnonymousProvider() throws Exception {
     Configuration conf = createConf(AnonymousAWSCredentialsProvider.class);
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
     try (FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf)) {
       Assertions.assertThat(fs)
           .describedAs("Filesystem")

+ 14 - 23
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java

@@ -22,7 +22,6 @@ import software.amazon.awssdk.services.s3.model.ObjectIdentifier;
 import software.amazon.awssdk.services.s3.model.S3Error;
 
 import org.assertj.core.api.Assertions;
-import org.junit.Assume;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.LocatedFileStatus;
@@ -47,6 +46,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.createFiles;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.isBulkDeleteEnabled;
 import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.failIf;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireDefaultExternalData;
 import static org.apache.hadoop.test.LambdaTestUtils.*;
 import static org.apache.hadoop.util.functional.RemoteIterators.mappingRemoteIterator;
 import static org.apache.hadoop.util.functional.RemoteIterators.toList;
@@ -156,31 +156,22 @@ public class ITestS3AFailureHandling extends AbstractS3ATestBase {
     timer.end("removeKeys");
   }
 
-
-  private Path maybeGetCsvPath() {
-    Configuration conf = getConfiguration();
-    String csvFile = conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
-    Assume.assumeTrue("CSV test file is not the default",
-        DEFAULT_CSVTEST_FILE.equals(csvFile));
-    return new Path(csvFile);
-  }
-
   /**
    * Test low-level failure handling with low level delete request.
    */
   @Test
   public void testMultiObjectDeleteNoPermissions() throws Throwable {
-    describe("Delete the landsat CSV file and expect it to fail");
-    Path csvPath = maybeGetCsvPath();
-    S3AFileSystem fs = (S3AFileSystem) csvPath.getFileSystem(
+    describe("Delete the external file and expect it to fail");
+    Path path = requireDefaultExternalData(getConfiguration());
+    S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(
         getConfiguration());
     // create a span, expect it to be activated.
     fs.getAuditSpanSource().createSpan(StoreStatisticNames.OP_DELETE,
-        csvPath.toString(), null);
+        path.toString(), null);
     List<ObjectIdentifier> keys
         = buildDeleteRequest(
             new String[]{
-                fs.pathToKey(csvPath),
+                fs.pathToKey(path),
                 "missing-key.csv"
             });
     MultiObjectDeleteException ex = intercept(
@@ -193,10 +184,10 @@ public class ITestS3AFailureHandling extends AbstractS3ATestBase {
     final String undeletedFiles = undeleted.stream()
         .map(Path::toString)
         .collect(Collectors.joining(", "));
-    failIf(undeleted.size() != 2,
-        "undeleted list size wrong: " + undeletedFiles,
-        ex);
-    assertTrue("no CSV in " +undeletedFiles, undeleted.contains(csvPath));
+    Assertions.assertThat(undeleted)
+        .describedAs("undeleted files")
+        .hasSize(2)
+        .contains(path);
   }
 
   /**
@@ -205,12 +196,12 @@ public class ITestS3AFailureHandling extends AbstractS3ATestBase {
    */
   @Test
   public void testSingleObjectDeleteNoPermissionsTranslated() throws Throwable {
-    describe("Delete the landsat CSV file and expect it to fail");
-    Path csvPath = maybeGetCsvPath();
-    S3AFileSystem fs = (S3AFileSystem) csvPath.getFileSystem(
+    describe("Delete the external file and expect it to fail");
+    Path path = requireDefaultExternalData(getConfiguration());
+    S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(
         getConfiguration());
     AccessDeniedException aex = intercept(AccessDeniedException.class,
-        () -> fs.delete(csvPath, false));
+        () -> fs.delete(path, false));
     Throwable cause = aex.getCause();
     failIf(cause == null, "no nested exception", aex);
   }

+ 47 - 38
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java

@@ -19,8 +19,9 @@
 package org.apache.hadoop.fs.s3a;
 
 import java.io.File;
-import java.net.URI;
+import java.util.UUID;
 
+import org.assertj.core.api.Assertions;
 import org.junit.Before;
 import org.junit.Test;
 import org.slf4j.Logger;
@@ -30,15 +31,16 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;
 
 import static org.apache.hadoop.fs.s3a.Constants.BUFFER_DIR;
-import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_BLOCK_DEFAULT_SIZE;
 import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_BLOCK_SIZE_KEY;
 import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_ENABLED_KEY;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
 import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
 
 /**
@@ -49,11 +51,21 @@ public class ITestS3APrefetchingCacheFiles extends AbstractS3ACostTest {
   private static final Logger LOG =
       LoggerFactory.getLogger(ITestS3APrefetchingCacheFiles.class);
 
+  /** use a small file size so small source files will still work. */
+  public static final int BLOCK_SIZE = 128 * 1024;
+
+  public static final int PREFETCH_OFFSET = 10240;
+
   private Path testFile;
+
+  /** The FS with the external file. */
   private FileSystem fs;
+
   private int prefetchBlockSize;
   private Configuration conf;
 
+  private String bufferDir;
+
   public ITestS3APrefetchingCacheFiles() {
     super(true);
   }
@@ -63,35 +75,31 @@ public class ITestS3APrefetchingCacheFiles extends AbstractS3ACostTest {
     super.setup();
     // Sets BUFFER_DIR by calling S3ATestUtils#prepareTestConfiguration
     conf = createConfiguration();
-    String testFileUri = S3ATestUtils.getCSVTestFile(conf);
 
-    testFile = new Path(testFileUri);
-    prefetchBlockSize = conf.getInt(PREFETCH_BLOCK_SIZE_KEY, PREFETCH_BLOCK_DEFAULT_SIZE);
-    fs = getFileSystem();
-    fs.initialize(new URI(testFileUri), conf);
+    testFile = getExternalData(conf);
+    prefetchBlockSize = conf.getInt(PREFETCH_BLOCK_SIZE_KEY, BLOCK_SIZE);
+    fs = FileSystem.get(testFile.toUri(), conf);
   }
 
   @Override
   public Configuration createConfiguration() {
     Configuration configuration = super.createConfiguration();
     S3ATestUtils.removeBaseAndBucketOverrides(configuration, PREFETCH_ENABLED_KEY);
-    S3ATestUtils.removeBaseAndBucketOverrides(configuration, PREFETCH_BLOCK_SIZE_KEY);
     configuration.setBoolean(PREFETCH_ENABLED_KEY, true);
+    // use a small block size unless explicitly set in the test config.
+    configuration.setInt(PREFETCH_BLOCK_SIZE_KEY, BLOCK_SIZE);
+    // patch buffer dir with a unique path for test isolation.
+    final String bufferDirBase = configuration.get(BUFFER_DIR);
+    bufferDir = bufferDirBase + "/" + UUID.randomUUID();
+    configuration.set(BUFFER_DIR, bufferDir);
     return configuration;
   }
 
   @Override
   public synchronized void teardown() throws Exception {
     super.teardown();
-    File tmpFileDir = new File(conf.get(BUFFER_DIR));
-    File[] tmpFiles = tmpFileDir.listFiles();
-    if (tmpFiles != null) {
-      for (File filePath : tmpFiles) {
-        String path = filePath.getPath();
-        if (path.endsWith(".bin") && path.contains("fs-cache-")) {
-          filePath.delete();
-        }
-      }
+    if (bufferDir != null) {
+      new File(bufferDir).delete();
     }
     cleanupWithLogger(LOG, fs);
     fs = null;
@@ -111,34 +119,35 @@ public class ITestS3APrefetchingCacheFiles extends AbstractS3ACostTest {
     try (FSDataInputStream in = fs.open(testFile)) {
       byte[] buffer = new byte[prefetchBlockSize];
 
-      in.read(buffer, 0, prefetchBlockSize - 10240);
-      in.seek(prefetchBlockSize * 2);
-      in.read(buffer, 0, prefetchBlockSize);
+      // read a bit less than a block
+      in.readFully(0, buffer, 0, prefetchBlockSize - PREFETCH_OFFSET);
+      // read at least some of a second block
+      in.read(prefetchBlockSize * 2, buffer, 0, prefetchBlockSize);
+
 
       File tmpFileDir = new File(conf.get(BUFFER_DIR));
-      assertTrue("The dir to keep cache files must exist", tmpFileDir.exists());
+      final LocalFileSystem localFs = FileSystem.getLocal(conf);
+      Path bufferDirPath = new Path(tmpFileDir.toURI());
+      ContractTestUtils.assertIsDirectory(localFs, bufferDirPath);
       File[] tmpFiles = tmpFileDir
           .listFiles((dir, name) -> name.endsWith(".bin") && name.contains("fs-cache-"));
-      boolean isCacheFileForBlockFound = tmpFiles != null && tmpFiles.length > 0;
-      if (!isCacheFileForBlockFound) {
-        LOG.warn("No cache files found under " + tmpFileDir);
-      }
-      assertTrue("File to cache block data must exist", isCacheFileForBlockFound);
+      Assertions.assertThat(tmpFiles)
+          .describedAs("Cache files not found under %s", tmpFileDir)
+          .isNotEmpty();
+
 
       for (File tmpFile : tmpFiles) {
         Path path = new Path(tmpFile.getAbsolutePath());
-        try (FileSystem localFs = FileSystem.getLocal(conf)) {
-          FileStatus stat = localFs.getFileStatus(path);
-          ContractTestUtils.assertIsFile(path, stat);
-          assertEquals("File length not matching with prefetchBlockSize", prefetchBlockSize,
-              stat.getLen());
-          assertEquals("User permissions should be RW", FsAction.READ_WRITE,
-              stat.getPermission().getUserAction());
-          assertEquals("Group permissions should be NONE", FsAction.NONE,
-              stat.getPermission().getGroupAction());
-          assertEquals("Other permissions should be NONE", FsAction.NONE,
-              stat.getPermission().getOtherAction());
-        }
+        FileStatus stat = localFs.getFileStatus(path);
+        ContractTestUtils.assertIsFile(path, stat);
+        assertEquals("File length not matching with prefetchBlockSize", prefetchBlockSize,
+            stat.getLen());
+        assertEquals("User permissions should be RW", FsAction.READ_WRITE,
+            stat.getPermission().getUserAction());
+        assertEquals("Group permissions should be NONE", FsAction.NONE,
+            stat.getPermission().getGroupAction());
+        assertEquals("Other permissions should be NONE", FsAction.NONE,
+            stat.getPermission().getOtherAction());
       }
     }
   }

+ 6 - 4
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java

@@ -111,14 +111,16 @@ public interface S3ATestConstants {
   String KEY_CSVTEST_FILE = S3A_SCALE_TEST + "csvfile";
 
   /**
-   * The landsat bucket: {@value}.
+   * Default path for the multi MB test file: {@value}.
+   * @deprecated retrieve via {@link PublicDatasetTestUtils}.
    */
-  String LANDSAT_BUCKET = "s3a://landsat-pds/";
+  @Deprecated
+  String DEFAULT_CSVTEST_FILE = PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE;
 
   /**
-   * Default path for the multi MB test file: {@value}.
+   * Example path for unit tests; this is never accessed: {@value}.
    */
-  String DEFAULT_CSVTEST_FILE = LANDSAT_BUCKET + "scene_list.gz";
+  String UNIT_TEST_EXAMPLE_PATH = "s3a://example/data/";
 
   /**
    * Configuration key for an existing object in a requester pays bucket: {@value}.

+ 14 - 11
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java

@@ -105,6 +105,8 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile;
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion;
 import static org.apache.hadoop.fs.s3a.impl.S3ExpressStorage.STORE_CAPABILITY_S3_EXPRESS_STORAGE;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireDefaultExternalDataFile;
 import static org.apache.hadoop.test.GenericTestUtils.buildPaths;
 import static org.apache.hadoop.util.Preconditions.checkNotNull;
 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH;
@@ -405,22 +407,22 @@ public final class S3ATestUtils {
    * Get the test CSV file; assume() that it is not empty.
    * @param conf test configuration
    * @return test file.
+   * @deprecated Retained only to assist cherrypicking patches
    */
+  @Deprecated
   public static String getCSVTestFile(Configuration conf) {
-    String csvFile = conf
-        .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
-    Assume.assumeTrue("CSV test file is not the default",
-        isNotEmpty(csvFile));
-    return csvFile;
+    return getExternalData(conf).toUri().toString();
   }
 
   /**
    * Get the test CSV path; assume() that it is not empty.
    * @param conf test configuration
    * @return test file as a path.
+   * @deprecated Retained only to assist cherrypicking patches
    */
+  @Deprecated
   public static Path getCSVTestPath(Configuration conf) {
-    return new Path(getCSVTestFile(conf));
+    return getExternalData(conf);
   }
 
   /**
@@ -429,12 +431,11 @@ public final class S3ATestUtils {
    * read only).
    * @return test file.
    * @param conf test configuration
+   * @deprecated Retained only to assist cherrypicking patches
    */
+  @Deprecated
   public static String getLandsatCSVFile(Configuration conf) {
-    String csvFile = getCSVTestFile(conf);
-    Assume.assumeTrue("CSV test file is not the default",
-        DEFAULT_CSVTEST_FILE.equals(csvFile));
-    return csvFile;
+    return requireDefaultExternalDataFile(conf);
   }
   /**
    * Get the test CSV file; assume() that it is not modified (i.e. we haven't
@@ -442,9 +443,11 @@ public final class S3ATestUtils {
    * read only).
    * @param conf test configuration
    * @return test file as a path.
+   * @deprecated Retained only to assist cherrypicking patches
    */
+  @Deprecated
   public static Path getLandsatCSVPath(Configuration conf) {
-    return new Path(getLandsatCSVFile(conf));
+    return getExternalData(conf);
   }
 
   /**

+ 9 - 12
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java

@@ -54,37 +54,34 @@ import org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider;
 import org.apache.hadoop.fs.s3a.auth.NoAuthWithAWSException;
 import org.apache.hadoop.fs.s3a.auth.delegation.CountInvocationsProvider;
 import org.apache.hadoop.fs.s3a.impl.InstantiationIOException;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.util.Sets;
 
 import static org.apache.hadoop.fs.s3a.Constants.ASSUMED_ROLE_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER_MAPPING;
-import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_CSVTEST_FILE;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.authenticationContains;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.buildClassListString;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getCSVTestPath;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.STANDARD_AWS_PROVIDERS;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.buildAWSProviderList;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.createAWSCredentialProviderList;
 import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.DOES_NOT_IMPLEMENT;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
 
 /**
  * Unit tests for {@link Constants#AWS_CREDENTIALS_PROVIDER} logic.
  */
-public class TestS3AAWSCredentialsProvider {
+public class TestS3AAWSCredentialsProvider extends AbstractS3ATestBase {
 
   /**
-   * URI of the landsat images.
+   * URI of the test file: this must be anonymously accessible.
+   * As these are unit tests no actual connection to the store is made.
    */
   private static final URI TESTFILE_URI = new Path(
-      DEFAULT_CSVTEST_FILE).toUri();
+      PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE).toUri();
 
   private static final Logger LOG = LoggerFactory.getLogger(TestS3AAWSCredentialsProvider.class);
 
@@ -127,7 +124,7 @@ public class TestS3AAWSCredentialsProvider {
         TemporaryAWSCredentialsProvider.NAME
             + ", \t" + SimpleAWSCredentialsProvider.NAME
             + " ,\n " + AnonymousAWSCredentialsProvider.NAME);
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
 
     AWSCredentialProviderList list = createAWSCredentialProviderList(
         testFile.toUri(), conf);
@@ -586,7 +583,7 @@ public class TestS3AAWSCredentialsProvider {
   @Test
   public void testConcurrentAuthentication() throws Throwable {
     Configuration conf = createProviderConfiguration(SlowProvider.class.getName());
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
 
     AWSCredentialProviderList list = createAWSCredentialProviderList(testFile.toUri(), conf);
 
@@ -656,7 +653,7 @@ public class TestS3AAWSCredentialsProvider {
   @Test
   public void testConcurrentAuthenticationError() throws Throwable {
     Configuration conf = createProviderConfiguration(ErrorProvider.class.getName());
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
 
     AWSCredentialProviderList list = createAWSCredentialProviderList(testFile.toUri(), conf);
     ErrorProvider provider = (ErrorProvider) list.getProviders().get(0);

+ 3 - 3
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/adapter/TestV1CredentialsProvider.java

@@ -39,9 +39,9 @@ import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
 import org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider;
 import org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider;
 import org.apache.hadoop.fs.s3a.impl.InstantiationIOException;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
-import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_CSVTEST_FILE;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.ANONYMOUS_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.EC2_CONTAINER_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.ENVIRONMENT_CREDENTIALS_V1;
@@ -56,10 +56,10 @@ import static org.junit.Assert.assertTrue;
 public class TestV1CredentialsProvider {
 
   /**
-   * URI of the landsat images.
+   * URI of the test file.
    */
   private static final URI TESTFILE_URI = new Path(
-      DEFAULT_CSVTEST_FILE).toUri();
+      PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE).toUri();
 
   private static final Logger LOG = LoggerFactory.getLogger(TestV1CredentialsProvider.class);
 

+ 2 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java

@@ -46,7 +46,6 @@ import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.s3a.AWSBadRequestException;
 import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
-import org.apache.hadoop.fs.s3a.S3ATestConstants;
 import org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider;
 import org.apache.hadoop.fs.s3a.commit.CommitConstants;
 import org.apache.hadoop.fs.s3a.commit.files.PendingSet;
@@ -68,6 +67,7 @@ import static org.apache.hadoop.fs.s3a.auth.RolePolicies.*;
 import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.forbidden;
 import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.newAssumedRoleConfig;
 import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath;
 import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsSourceToString;
 import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
 import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
@@ -115,7 +115,7 @@ public class ITestAssumeRole extends AbstractS3ATestBase {
   public void setup() throws Exception {
     super.setup();
     assumeRoleTests();
-    uri = new URI(S3ATestConstants.DEFAULT_CSVTEST_FILE);
+    uri = requireAnonymousDataPath(getConfiguration()).toUri();
   }
 
   @Override

+ 21 - 10
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java

@@ -58,6 +58,8 @@ import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.probeForAssumedRoleARN
 import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*;
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled;
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.closeUserFileSystems;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getOrcData;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath;
 
 /**
  * Submit a job with S3 delegation tokens.
@@ -106,10 +108,17 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
 
   private Path destPath;
 
-  private static final Path EXTRA_JOB_RESOURCE_PATH
-      = new Path("s3a://osm-pds/planet/planet-latest.orc");
+  /**
+   * Path of the extra job resource; set up in
+   * {@link #createConfiguration()}.
+   */
+  private Path extraJobResourcePath;
 
-  public static final URI jobResource = EXTRA_JOB_RESOURCE_PATH.toUri();
+  /**
+   * URI of the extra job resource; set up in
+   * {@link #createConfiguration()}.
+   */
+  private URI jobResourceUri;
 
   /**
    * Test array for parameterized test runs.
@@ -161,7 +170,9 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
         10_000);
 
-    String host = jobResource.getHost();
+    extraJobResourcePath = getOrcData(conf);
+    jobResourceUri = extraJobResourcePath.toUri();
+    String host = jobResourceUri.getHost();
     // and fix to the main endpoint if the caller has moved
     conf.set(
         String.format("fs.s3a.bucket.%s.endpoint", host), "");
@@ -229,9 +240,9 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
 
   @Test
   public void testCommonCrawlLookup() throws Throwable {
-    FileSystem resourceFS = EXTRA_JOB_RESOURCE_PATH.getFileSystem(
+    FileSystem resourceFS = extraJobResourcePath.getFileSystem(
         getConfiguration());
-    FileStatus status = resourceFS.getFileStatus(EXTRA_JOB_RESOURCE_PATH);
+    FileStatus status = resourceFS.getFileStatus(extraJobResourcePath);
     LOG.info("Extra job resource is {}", status);
     assertTrue("Not encrypted: " + status, status.isEncrypted());
   }
@@ -241,9 +252,9 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     describe("Mock Job test");
     JobConf conf = new JobConf(getConfiguration());
 
-    // the input here is the landsat file; which lets
+    // the input here is the external file; which lets
     // us differentiate source URI from dest URI
-    Path input = new Path(DEFAULT_CSVTEST_FILE);
+    Path input = requireAnonymousDataPath(getConfiguration());
     final FileSystem sourceFS = input.getFileSystem(conf);
 
 
@@ -272,7 +283,7 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     // This is to actually stress the terasort code for which
     // the yarn ResourceLocalizationService was having problems with
     // fetching resources from.
-    URI partitionUri = new URI(EXTRA_JOB_RESOURCE_PATH.toString() +
+    URI partitionUri = new URI(extraJobResourcePath.toString() +
         "#_partition.lst");
     job.addCacheFile(partitionUri);
 
@@ -302,7 +313,7 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     // look up the destination token
     lookupToken(submittedCredentials, fs.getUri(), tokenKind);
     lookupToken(submittedCredentials,
-        EXTRA_JOB_RESOURCE_PATH.getFileSystem(conf).getUri(), tokenKind);
+        extraJobResourcePath.getFileSystem(conf).getUri(), tokenKind);
   }
 
 }

+ 2 - 3
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFilesystem.java

@@ -53,8 +53,7 @@ public class ITestRoleDelegationInFilesystem extends
 
   /**
    * This verifies that the granted credentials only access the target bucket
-   * by using the credentials in a new S3 client to query the AWS-owned landsat
-   * bucket.
+   * by using the credentials in a new S3 client to query the public data bucket.
    * @param delegatedFS delegated FS with role-restricted access.
    * @throws Exception failure
    */
@@ -62,7 +61,7 @@ public class ITestRoleDelegationInFilesystem extends
   protected void verifyRestrictedPermissions(final S3AFileSystem delegatedFS)
       throws Exception {
     intercept(AccessDeniedException.class,
-        () -> readLandsatMetadata(delegatedFS));
+        () -> readExternalDatasetMetadata(delegatedFS));
   }
 
 }

+ 9 - 8
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFilesystem.java

@@ -79,6 +79,7 @@ import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationTokenIOExceptio
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.ALICE;
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled;
 import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.lookupS3ADelegationToken;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath;
 import static org.apache.hadoop.test.LambdaTestUtils.doAs;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.hamcrest.Matchers.containsString;
@@ -344,7 +345,7 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
     // TODO: Check what should happen here. Calling headObject() on the root path fails in V2,
     // with the error that key cannot be empty.
    // fs.getObjectMetadata(new Path("/"));
-    readLandsatMetadata(fs);
+    readExternalDatasetMetadata(fs);
 
     URI uri = fs.getUri();
     // create delegation tokens from the test suites FS.
@@ -463,13 +464,13 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
   }
 
   /**
-   * Session tokens can read the landsat bucket without problems.
+   * Session tokens can read the external bucket without problems.
    * @param delegatedFS delegated FS
    * @throws Exception failure
    */
   protected void verifyRestrictedPermissions(final S3AFileSystem delegatedFS)
       throws Exception {
-    readLandsatMetadata(delegatedFS);
+    readExternalDatasetMetadata(delegatedFS);
   }
 
   @Test
@@ -582,7 +583,7 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
 
   /**
    * This verifies that the granted credentials only access the target bucket
-   * by using the credentials in a new S3 client to query the AWS-owned landsat
+   * by using the credentials in a new S3 client to query the external
    * bucket.
    * @param delegatedFS delegated FS with role-restricted access.
    * @throws AccessDeniedException if the delegated FS's credentials can't
@@ -590,17 +591,17 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
    * @return result of the HEAD
    * @throws Exception failure
    */
-  protected HeadBucketResponse readLandsatMetadata(final S3AFileSystem delegatedFS)
+  protected HeadBucketResponse readExternalDatasetMetadata(final S3AFileSystem delegatedFS)
       throws Exception {
     AWSCredentialProviderList testingCreds
         = delegatedFS.getS3AInternals().shareCredentials("testing");
 
-    URI landsat = new URI(DEFAULT_CSVTEST_FILE);
+    URI external = requireAnonymousDataPath(getConfiguration()).toUri();
     DefaultS3ClientFactory factory
         = new DefaultS3ClientFactory();
     Configuration conf = delegatedFS.getConf();
     factory.setConf(conf);
-    String host = landsat.getHost();
+    String host = external.getHost();
     S3ClientFactory.S3ClientCreationParameters parameters = null;
     parameters = new S3ClientFactory.S3ClientCreationParameters()
         .withCredentialSet(testingCreds)
@@ -609,7 +610,7 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
             .newStatisticsFromAwsSdk())
         .withUserAgentSuffix("ITestSessionDelegationInFilesystem");
 
-    S3Client s3 = factory.createS3Client(landsat, parameters);
+    S3Client s3 = factory.createS3Client(external, parameters);
 
     return Invoker.once("HEAD", host,
         () -> s3.headBucket(b -> b.bucket(host)));

+ 8 - 8
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java

@@ -24,10 +24,10 @@ import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.hadoop.fs.s3a.S3AEncryptionMethods;
-import org.apache.hadoop.fs.s3a.S3ATestConstants;
 import org.apache.hadoop.fs.s3a.S3ATestUtils;
 import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding;
 import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.SecretManager;
@@ -44,11 +44,11 @@ import static org.junit.Assert.assertTrue;
  */
 public class TestS3ADelegationTokenSupport {
 
-  private static URI landsatUri;
+  private static URI externalUri;
 
   @BeforeClass
   public static void classSetup() throws Exception {
-    landsatUri = new URI(S3ATestConstants.DEFAULT_CSVTEST_FILE);
+    externalUri = new URI(PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE);
   }
 
   @Test
@@ -74,7 +74,7 @@ public class TestS3ADelegationTokenSupport {
         = new SessionTokenIdentifier(SESSION_TOKEN_KIND,
         alice,
         renewer,
-        new URI("s3a://landsat-pds/"),
+        new URI("s3a://anything/"),
         new MarshalledCredentials("a", "b", ""),
         new EncryptionSecrets(S3AEncryptionMethods.SSE_S3, ""),
         "origin");
@@ -116,7 +116,7 @@ public class TestS3ADelegationTokenSupport {
         SESSION_TOKEN_KIND,
         new Text(),
         renewer,
-        landsatUri,
+        externalUri,
         new MarshalledCredentials("a", "b", "c"),
         new EncryptionSecrets(), "");
 
@@ -135,7 +135,7 @@ public class TestS3ADelegationTokenSupport {
         SESSION_TOKEN_KIND,
         new Text(),
         null,
-        landsatUri,
+        externalUri,
         new MarshalledCredentials("a", "b", "c"),
         new EncryptionSecrets(), "");
 
@@ -151,7 +151,7 @@ public class TestS3ADelegationTokenSupport {
   @Test
   public void testRoleTokenIdentifierRoundTrip() throws Throwable {
     RoleTokenIdentifier id = new RoleTokenIdentifier(
-        landsatUri,
+        externalUri,
         new Text(),
         new Text(),
         new MarshalledCredentials("a", "b", "c"),
@@ -170,7 +170,7 @@ public class TestS3ADelegationTokenSupport {
   public void testFullTokenIdentifierRoundTrip() throws Throwable {
     Text renewer = new Text("renewerName");
     FullCredentialsTokenIdentifier id = new FullCredentialsTokenIdentifier(
-        landsatUri,
+        externalUri,
         new Text(),
         renewer,
         new MarshalledCredentials("a", "b", ""),

+ 2 - 1
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestPaths.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.test.HadoopTestBase;
 
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.UNIT_TEST_EXAMPLE_PATH;
 import static org.apache.hadoop.fs.s3a.commit.staging.Paths.*;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 
@@ -81,7 +82,7 @@ public class TestPaths extends HadoopTestBase {
     assertEquals("from " + path, expected, addUUID(path, "UUID"));
   }
 
-  private static final String DATA = "s3a://landsat-pds/data/";
+  private static final String DATA = UNIT_TEST_EXAMPLE_PATH;
   private static final Path BASE = new Path(DATA);
 
   @Test

+ 20 - 22
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java

@@ -22,14 +22,17 @@ import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.InputStreamReader;
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
 import org.junit.Test;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.test.LambdaTestUtils;
 import org.apache.hadoop.util.StringUtils;
 
@@ -40,7 +43,6 @@ import static org.apache.hadoop.fs.s3a.MultipartTestUtils.assertNoUploadsAt;
 import static org.apache.hadoop.fs.s3a.MultipartTestUtils.clearAnyUploads;
 import static org.apache.hadoop.fs.s3a.MultipartTestUtils.countUploadsAt;
 import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createPartUpload;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVFile;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.BucketInfo;
 import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_BAD_STATE;
@@ -57,36 +59,32 @@ public class ITestS3GuardTool extends AbstractS3GuardToolTestBase {
       "-force", "-verbose"};
 
   @Test
-  public void testLandsatBucketUnguarded() throws Throwable {
-    run(BucketInfo.NAME,
-        "-" + BucketInfo.UNGUARDED_FLAG,
-        getLandsatCSVFile(getConfiguration()));
-  }
-
-  @Test
-  public void testLandsatBucketRequireGuarded() throws Throwable {
-    runToFailure(E_BAD_STATE,
-        BucketInfo.NAME,
-        "-" + BucketInfo.GUARDED_FLAG,
-        getLandsatCSVFile(
-            ITestS3GuardTool.this.getConfiguration()));
-  }
-
-  @Test
-  public void testLandsatBucketRequireUnencrypted() throws Throwable {
+  public void testExternalBucketRequireUnencrypted() throws Throwable {
     removeBaseAndBucketOverrides(getConfiguration(), S3_ENCRYPTION_ALGORITHM);
     run(BucketInfo.NAME,
         "-" + BucketInfo.ENCRYPTION_FLAG, "none",
-        getLandsatCSVFile(getConfiguration()));
+        externalBucket());
+  }
+
+  /**
+   * Get the external bucket; this is the bucket of the default external file.
+   * If the file is not set to the default value, the test will be skipped.
+   * @return the bucket of the default external file.
+   */
+  private String externalBucket() {
+    Configuration conf = getConfiguration();
+    Path result = PublicDatasetTestUtils.requireDefaultExternalData(conf);
+    final URI uri = result.toUri();
+    final String bucket = uri.getScheme() + "://" + uri.getHost();
+    return bucket;
   }
 
   @Test
-  public void testLandsatBucketRequireEncrypted() throws Throwable {
+  public void testExternalBucketRequireEncrypted() throws Throwable {
     runToFailure(E_BAD_STATE,
         BucketInfo.NAME,
         "-" + BucketInfo.ENCRYPTION_FLAG,
-        "AES256", getLandsatCSVFile(
-            ITestS3GuardTool.this.getConfiguration()));
+        "AES256", externalBucket());
   }
 
   @Test

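For illustration: a minimal, self-contained sketch of the bucket derivation that externalBucket() performs, using the noaa-cors-pds path cited elsewhere in this patch (the wrapper class and main method are purely illustrative, not part of the change):

    import java.net.URI;

    import org.apache.hadoop.fs.Path;

    /** Sketch: derive a bucket URI from a path, as externalBucket() does. */
    public class BucketUriSketch {
      public static void main(String[] args) {
        URI uri = new Path(
            "s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz").toUri();
        // keep only scheme and host: "s3a://noaa-cors-pds"
        String bucket = uri.getScheme() + "://" + uri.getHost();
        System.out.println(bucket);
      }
    }

That bucket URI is what the bucket-info probes above are handed in place of the full file path.
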
+ 3 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestAuthoritativePath.java

@@ -33,6 +33,7 @@ import org.apache.hadoop.fs.s3a.S3AUtils;
 import org.apache.hadoop.test.AbstractHadoopTestBase;
 
 import static org.apache.hadoop.fs.s3a.Constants.AUTHORITATIVE_PATH;
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.UNIT_TEST_EXAMPLE_PATH;
 import static org.assertj.core.api.Assertions.assertThat;
 
 /**
@@ -71,7 +72,7 @@ public class TestAuthoritativePath extends AbstractHadoopTestBase {
   @Test
   public void testOtherBucket() throws Throwable {
     assertAuthPaths(l("/one/",
-        "s3a://landsat-pds/",
+        UNIT_TEST_EXAMPLE_PATH,
         BASE + "/two/"),
         "/one/", "/two/");
   }
@@ -79,7 +80,7 @@ public class TestAuthoritativePath extends AbstractHadoopTestBase {
   @Test
   public void testOtherScheme() throws Throwable {
     assertAuthPaths(l("/one/",
-        "s3a://landsat-pds/",
+        UNIT_TEST_EXAMPLE_PATH,
         "http://bucket/two/"),
         "/one/");
   }

+ 15 - 3
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java

@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.s3a.S3AInputPolicy;
 import org.apache.hadoop.fs.s3a.S3AInputStream;
 import org.apache.hadoop.fs.s3a.S3ATestUtils;
 import org.apache.hadoop.fs.s3a.statistics.S3AInputStreamStatistics;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.fs.statistics.IOStatistics;
 import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
 import org.apache.hadoop.fs.statistics.MeanStatistic;
@@ -112,7 +113,9 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase {
     Configuration conf = getConf();
     conf.setInt(SOCKET_SEND_BUFFER, 16 * 1024);
     conf.setInt(SOCKET_RECV_BUFFER, 16 * 1024);
-    String testFile =  conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
+    // look up the test file; it is not required to be set.
+    String testFile =  conf.getTrimmed(KEY_CSVTEST_FILE,
+        PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE);
     if (testFile.isEmpty()) {
       assumptionMessage = "Empty test property: " + KEY_CSVTEST_FILE;
       LOG.warn(assumptionMessage);
@@ -394,6 +397,9 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase {
     CompressionCodecFactory factory
         = new CompressionCodecFactory(getConf());
     CompressionCodec codec = factory.getCodec(testData);
+    Assertions.assertThat(codec)
+        .describedAs("No codec found for %s", testData)
+        .isNotNull();
     long bytesRead = 0;
     int lines = 0;
 
@@ -525,12 +531,18 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase {
     describe("Random IO with policy \"%s\"", policy);
     byte[] buffer = new byte[_1MB];
     long totalBytesRead = 0;
-
+    final long len = testDataStatus.getLen();
     in = openTestFile(policy, 0);
     ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
     for (int[] action : RANDOM_IO_SEQUENCE) {
-      int position = action[0];
+      long position = action[0];
       int range = action[1];
+      // if a read goes past EOF, fail with details
+      // this will happen if the test datafile is too small.
+      Assertions.assertThat(position + range)
+          .describedAs("readFully(pos=%d range=%d) of %s",
+              position, range, testDataStatus)
+          .isLessThanOrEqualTo(len);
       in.readFully(position, buffer, 0, range);
       totalBytesRead += range;
     }

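The codec assertion added above guards the suffix-driven codec lookup; a hedged sketch of that lookup in isolation (the path literal is illustrative, the compression classes are the standard Hadoop API):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;

    /** Sketch: resolve a codec from a file suffix, as the test does. */
    public class CodecLookupSketch {
      public static void main(String[] args) {
        CompressionCodecFactory factory =
            new CompressionCodecFactory(new Configuration());
        // ".gz" resolves to GzipCodec; an unrecognized suffix yields null,
        // which the new assertion reports instead of a later NullPointerException.
        CompressionCodec codec = factory.getCodec(
            new Path("s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz"));
        System.out.println(codec == null
            ? "no codec" : codec.getClass().getName());
      }
    }
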
+ 14 - 45
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java

@@ -22,61 +22,30 @@ import org.junit.Test;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
-import org.apache.hadoop.fs.s3a.S3AFileSystem;
-import org.apache.hadoop.fs.statistics.IOStatistics;
+import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;
 
-import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_ENDPOINT;
-import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath;
+import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
 import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_REQUEST;
-import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
 
 /**
  * Verify that AWS SDK statistics are wired up.
- * This test tries to read data from US-east-1 and us-west-2 buckets
- * so as to be confident that the nuances of region mapping
- * are handed correctly (HADOOP-13551).
- * The statistics are probed to verify that the wiring up is complete.
  */
-public class ITestAWSStatisticCollection extends AbstractS3ATestBase {
+public class ITestAWSStatisticCollection extends AbstractS3ACostTest {
 
-  private static final Path COMMON_CRAWL_PATH
-      = new Path("s3a://osm-pds/planet/planet-latest.orc");
-
-  @Test
-  public void testLandsatStatistics() throws Throwable {
-    final Configuration conf = getConfiguration();
-    // skips the tests if the landsat path isn't the default.
-    Path path = getLandsatCSVPath(conf);
-    conf.set(ENDPOINT, DEFAULT_ENDPOINT);
-    conf.unset("fs.s3a.bucket.landsat-pds.endpoint");
-
-    try (S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(conf)) {
-      fs.getS3AInternals().getObjectMetadata(path);
-      IOStatistics iostats = fs.getIOStatistics();
-      assertThatStatisticCounter(iostats,
-          STORE_IO_REQUEST.getSymbol())
-          .isGreaterThanOrEqualTo(1);
-    }
+  @Override
+  public Configuration createConfiguration() {
+    final Configuration conf = super.createConfiguration();
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, true);
+    return conf;
   }
 
   @Test
-  public void testCommonCrawlStatistics() throws Throwable {
-    final Configuration conf = getConfiguration();
-    // skips the tests if the landsat path isn't the default.
-    getLandsatCSVPath(conf);
-
-    Path path = COMMON_CRAWL_PATH;
-    conf.set(ENDPOINT, DEFAULT_ENDPOINT);
-
-    try (S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(conf)) {
-      fs.getS3AInternals().getObjectMetadata(path);
-      IOStatistics iostats = fs.getIOStatistics();
-      assertThatStatisticCounter(iostats,
-          STORE_IO_REQUEST.getSymbol())
-          .isGreaterThanOrEqualTo(1);
-    }
+  public void testSDKMetricsCostOfGetFileStatusOnFile() throws Throwable {
+    describe("performing getFileStatus on a file");
+    Path simpleFile = file(methodPath());
+    // and repeat on the file looking at AWS wired up stats
+    verifyMetrics(() -> getFileSystem().getFileStatus(simpleFile),
+        with(STORE_IO_REQUEST, 1));
   }
 
 }

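The direct statistics probe deleted above remains a valid pattern outside the cost-test harness; a sketch of it as a standalone helper (same classes and symbols as the removed code; the wrapper class is illustrative):

    import org.apache.hadoop.fs.s3a.S3AFileSystem;
    import org.apache.hadoop.fs.statistics.IOStatistics;

    import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_REQUEST;
    import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;

    /** Sketch: assert the SDK request counter moved after an S3 call. */
    final class StatisticProbeSketch {
      static void assertRequestsIssued(S3AFileSystem fs) {
        IOStatistics iostats = fs.getIOStatistics();
        assertThatStatisticCounter(iostats, STORE_IO_REQUEST.getSymbol())
            .isGreaterThanOrEqualTo(1);
      }
    }
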
+ 82 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java

@@ -18,9 +18,13 @@
 
 package org.apache.hadoop.fs.s3a.test;
 
+import org.junit.Assume;
+
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3ATestConstants;
 import org.apache.hadoop.fs.s3a.S3ATestUtils;
 
 import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_BUCKET_WITH_MANY_OBJECTS;
@@ -69,6 +73,77 @@ public final class PublicDatasetTestUtils {
   private static final String DEFAULT_BUCKET_WITH_MANY_OBJECTS
       = "s3a://usgs-landsat/collection02/level-1/";
 
+  /**
+   * Path to an ORC dataset.
+   */
+  private static final Path ORC_DATA = new Path("s3a://osm-pds/planet/planet-latest.orc");
+
+  /**
+   * Provide a Path for some ORC data.
+   *
+   * @param conf Hadoop configuration
+   * @return path to the ORC data file
+   */
+  public static Path getOrcData(Configuration conf) {
+    return ORC_DATA;
+  }
+
+  /**
+   * Default path for the external test file: {@value}.
+   * This must be: gzipped, large enough for the performance
+   * tests and in a read-only bucket with anonymous access.
+   */
+  public static final String DEFAULT_EXTERNAL_FILE =
+      "s3a://noaa-cors-pds/raw/2023/017/ohfh/OHFH017d.23_.gz";
+
+  /**
+   * Get the external test file.
+   * <p>
+   * This must be: gzipped, large enough for the performance
+   * tests and in a read-only bucket with anonymous access.
+   * @param conf configuration
+   * @return a dataset which meets the requirements.
+   */
+  public static Path getExternalData(Configuration conf) {
+    return new Path(fetchFromConfig(conf,
+        S3ATestConstants.KEY_CSVTEST_FILE, DEFAULT_EXTERNAL_FILE));
+  }
+
+  /**
+   * Get the anonymous dataset.
+   * @param conf configuration
+   * @return a dataset which supports anonymous access.
+   */
+  public static Path requireAnonymousDataPath(Configuration conf) {
+    return requireDefaultExternalData(conf);
+  }
+
+  /**
+   * Get the external test file; assume() that it is not modified (i.e. we haven't
+   * switched to a new storage infrastructure where the bucket is no longer
+   * read-only).
+   * @param conf test configuration
+   * @return test file.
+   */
+  public static String requireDefaultExternalDataFile(Configuration conf) {
+    String filename = getExternalData(conf).toUri().toString();
+    Assume.assumeTrue("External test file is not the default",
+        DEFAULT_EXTERNAL_FILE.equals(filename));
+    return filename;
+  }
+
+  /**
+   * Get the external test file as a path; assume() that it is not modified (i.e. we haven't
+   * switched to a new storage infrastructure where the bucket is no longer
+   * read-only).
+   * @param conf test configuration
+   * @return test file as a path.
+   */
+  public static Path requireDefaultExternalData(Configuration conf) {
+    return new Path(requireDefaultExternalDataFile(conf));
+  }
+
   /**
    * Provide a URI for a directory containing many objects.
    *
@@ -97,6 +172,13 @@ public final class PublicDatasetTestUtils {
         KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE);
   }
 
+  /**
+   * Fetch a trimmed configuration value; require it to be non-empty.
+   * @param conf configuration
+   * @param key key
+   * @param defaultValue default value.
+   * @return the resolved value.
+   */
   private static String fetchFromConfig(Configuration conf, String key, String defaultValue) {
     String value = conf.getTrimmed(key, defaultValue);
 

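A hedged usage sketch of the helpers added above (the wrapper class and method are illustrative; the helper names and fallback behaviour are as defined in this file):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;

    /** Sketch: resolve and stat the external test file. */
    final class ExternalDataSketch {
      static long externalFileLength(Configuration conf) throws Exception {
        // falls back to DEFAULT_EXTERNAL_FILE when fs.s3a.scale.test.csvfile
        // is unset; requireDefaultExternalData() would instead skip the caller
        // via assume() if the value had been overridden.
        Path external = PublicDatasetTestUtils.getExternalData(conf);
        try (FileSystem fs = external.getFileSystem(conf)) {
          return fs.getFileStatus(external).getLen();
        }
      }
    }
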
+ 30 - 10
hadoop-tools/hadoop-aws/src/test/resources/core-site.xml

@@ -30,37 +30,57 @@
     <final>false</final>
   </property>
 
-  <!-- Per-bucket configurations: landsat-pds -->
   <!--
+    Test file for some scale tests.
+
     A CSV file in this bucket was used for testing S3 select.
     Although this feature has been removed, (HADOOP-18830)
     it is still used in some tests as a large file to read
-    in a bucket without write permissions.
-    These tests do not need a CSV file.
+    and as a file in a bucket without write permissions.
+    The original file s3a://landsat-pds/scene_list.gz is
+    on a now-inaccessible bucket.
   -->
+<!--
+  This is defined in PublicDatasetTestUtils;
+  if needed for older builds, this can be copied into
+  auth-keys.xml along with the other bucket binding information,
+  all of which is defined exclusively here.
+
   <property>
-    <name>fs.s3a.bucket.landsat-pds.endpoint.region</name>
-    <value>us-west-2</value>
-    <description>The region for s3a://landsat-pds</description>
+    <name>fs.s3a.scale.test.csvfile</name>
+    <value>s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz</value>
+    <description>file used in scale tests</description>
   </property>
+-->
 
   <property>
-    <name>fs.s3a.bucket.landsat-pds.multipart.purge</name>
+    <name>fs.s3a.bucket.noaa-cors-pds.endpoint.region</name>
+    <value>us-east-1</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.bucket.noaa-isd-pds.multipart.purge</name>
     <value>false</value>
     <description>Don't try to purge uploads in the read-only bucket, as
     it will only create log noise.</description>
   </property>
 
   <property>
-    <name>fs.s3a.bucket.landsat-pds.probe</name>
+    <name>fs.s3a.bucket.noaa-isd-pds.probe</name>
     <value>0</value>
     <description>Let's postpone existence checks to the first IO operation </description>
   </property>
 
   <property>
-    <name>fs.s3a.bucket.landsat-pds.audit.add.referrer.header</name>
+    <name>fs.s3a.bucket.noaa-isd-pds.audit.add.referrer.header</name>
     <value>false</value>
-    <description>Do not add the referrer header to landsat operations</description>
+    <description>Do not add the referrer header</description>
+  </property>
+
+  <property>
+    <name>fs.s3a.bucket.noaa-isd-pds.prefetch.block.size</name>
+    <value>128k</value>
+    <description>Use a small prefetch size so tests fetch multiple blocks</description>
   </property>
 
   <!-- Per-bucket configurations: usgs-landsat -->
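
For context on how the per-bucket keys above take effect: when an S3A filesystem is instantiated, bucket-qualified options are copied down onto the base fs.s3a. keys. A sketch using the real S3AUtils.propagateBucketOptions helper (the surrounding class is illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.S3AUtils;

    /** Sketch: per-bucket option propagation onto the base keys. */
    public class BucketOptionSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration(false);
        conf.set("fs.s3a.bucket.noaa-isd-pds.prefetch.block.size", "128k");
        Configuration patched =
            S3AUtils.propagateBucketOptions(conf, "noaa-isd-pds");
        // prints "128k": the bucket-specific value now overrides the default
        System.out.println(patched.get("fs.s3a.prefetch.block.size"));
      }
    }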