
HADOOP-19057. S3A: Landsat bucket used in tests no longer accessible (#6515)

The AWS landsat data previously used in some S3A tests is no
longer accessible

This PR moves to the new external file
s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz

* Large enough file for scale tests
* Bucket supports anonymous access (see the check after this list)
* Ends in .gz to keep codec tests happy
* No spaces in path to keep bucket-info happy
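
A quick way to confirm the bucket's anonymous access, mirroring the `hadoop fs`
example updated in `index.md` below (the path is the new external file's parent
directory; actual listing output will vary):

```bash
# List the directory holding the new external test file without any credentials.
hadoop fs -ls \
 -D fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
 s3a://noaa-cors-pds/raw/2024/001/akse/
```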

Test Code Changes
* Leaves the test key name alone: fs.s3a.scale.test.csvfile
* Rename all methods and fields to remove "csv" from their names and
  move to "external file", as we no longer require it to be CSV.
* Path definition and helper methods have been moved to PublicDatasetTestUtils
  (see the sketch after this list)
* Improve error reporting in ITestS3AInputStreamPerformance if the file
  is too short
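
As an illustration, here is a minimal sketch of how a test can now resolve and
open the external file through the new helper. The helper name and the
`FileSystem.newInstance()` pattern are taken from the diffs below; the wrapper
class and the printed output are purely hypothetical.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;

public class ExternalFileExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Resolve the shared external test file; fs.s3a.scale.test.csvfile
    // overrides the default noaa-cors-pds object if set.
    Path testFile = getExternalData(conf);
    // Open a fresh filesystem instance for the file's URI, as
    // ITestS3AAWSCredentialsProvider does in its anonymous-access test.
    try (FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf)) {
      System.out.println("External test file: " + testFile
          + ", length: " + fs.getFileStatus(testFile).getLen());
    }
  }
}
```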
  
With S3 Select removed, there is no need for the file to be
a CSV file; however, one test still tries to unzip it, and other
tests have a minimum file size.

Consult the JIRA for the settings to add to auth-keys.xml
to switch earlier builds to this same file.
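
The JIRA remains the authoritative source for those settings; based on the
`testing.md` changes in this PR, a likely `auth-keys.xml` snippet (values
assumed from the new default file) is:

```xml
<!-- Assumed settings; consult HADOOP-19057 for the definitive values. -->
<property>
  <name>fs.s3a.scale.test.csvfile</name>
  <value>s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz</value>
</property>
<property>
  <!-- Region of the noaa-cors-pds bucket, per the updated testing.md. -->
  <name>fs.s3a.bucket.noaa-cors-pds.endpoint.region</name>
  <value>us-east-1</value>
</property>
```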

Contributed by Steve Loughran
Steve Loughran 1 year ago
parent commit 7651afd3db
30 changed files with 362 additions and 298 deletions
  1. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md (+1, -1)
  2. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/auditing.md (+4, -4)
  3. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md (+3, -30)
  4. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md (+4, -5)
  5. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md (+1, -1)
  6. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md (+7, -7)
  7. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/directory_markers.md (+14, -16)
  8. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/encryption.md (+7, -8)
  9. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md (+3, -3)
  10. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md (+4, -3)
  11. hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md (+11, -13)
  12. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java (+2, -2)
  13. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java (+14, -23)
  14. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java (+47, -38)
  15. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java (+6, -4)
  16. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java (+14, -11)
  17. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java (+9, -12)
  18. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/adapter/TestV1CredentialsProvider.java (+3, -3)
  19. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java (+2, -2)
  20. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java (+21, -10)
  21. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFilesystem.java (+2, -3)
  22. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFilesystem.java (+9, -8)
  23. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java (+8, -8)
  24. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestPaths.java (+2, -1)
  25. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java (+20, -22)
  26. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestAuthoritativePath.java (+3, -2)
  27. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java (+15, -3)
  28. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java (+14, -45)
  29. hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java (+82, -0)
  30. hadoop-tools/hadoop-aws/src/test/resources/core-site.xml (+30, -10)

+ 1 - 1
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md

@@ -585,7 +585,7 @@ If an operation fails with an `AccessDeniedException`, then the role does not ha
 the permission for the S3 Operation invoked during the call.
 the permission for the S3 Operation invoked during the call.
 
 
 ```
 ```
-> hadoop fs -touch  s3a://landsat-pds/a
+> hadoop fs -touch  s3a://noaa-isd-pds/a
 
 
 java.nio.file.AccessDeniedException: a: Writing Object on a:
 java.nio.file.AccessDeniedException: a: Writing Object on a:
  software.amazon.awssdk.services.s3.model.S3Exception: Access Denied
  software.amazon.awssdk.services.s3.model.S3Exception: Access Denied

+ 4 - 4
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/auditing.md

@@ -111,9 +111,9 @@ Specific buckets can have auditing disabled, even when it is enabled globally.
 
 
 ```xml
 ```xml
 <property>
 <property>
-  <name>fs.s3a.bucket.landsat-pds.audit.enabled</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.audit.enabled</name>
   <value>false</value>
   <value>false</value>
-  <description>Do not audit landsat bucket operations</description>
+  <description>Do not audit bucket operations</description>
 </property>
 </property>
 ```
 ```
 
 
@@ -342,9 +342,9 @@ either globally or for specific buckets:
 </property>
 </property>
 
 
 <property>
 <property>
-  <name>fs.s3a.bucket.landsat-pds.audit.referrer.enabled</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.audit.referrer.enabled</name>
   <value>false</value>
   <value>false</value>
-  <description>Do not add the referrer header to landsat operations</description>
+  <description>Do not add the referrer header to operations</description>
 </property>
 </property>
 ```
 ```
 
 

+ 3 - 30
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md

@@ -747,7 +747,7 @@ For example, for any job executed through Hadoop MapReduce, the Job ID can be us
 ### `Filesystem does not have support for 'magic' committer`
 ### `Filesystem does not have support for 'magic' committer`
 
 
 ```
 ```
-org.apache.hadoop.fs.s3a.commit.PathCommitException: `s3a://landsat-pds': Filesystem does not have support for 'magic' committer enabled
+org.apache.hadoop.fs.s3a.commit.PathCommitException: `s3a://noaa-isd-pds': Filesystem does not have support for 'magic' committer enabled
 in configuration option fs.s3a.committer.magic.enabled
 in configuration option fs.s3a.committer.magic.enabled
 ```
 ```
 
 
@@ -760,42 +760,15 @@ Remove all global/per-bucket declarations of `fs.s3a.bucket.magic.enabled` or se
 
 
 ```xml
 ```xml
 <property>
 <property>
-  <name>fs.s3a.bucket.landsat-pds.committer.magic.enabled</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.committer.magic.enabled</name>
   <value>true</value>
   <value>true</value>
 </property>
 </property>
 ```
 ```
 
 
 Tip: you can verify that a bucket supports the magic committer through the
 Tip: you can verify that a bucket supports the magic committer through the
-`hadoop s3guard bucket-info` command:
+`hadoop s3guard bucket-info` command.
 
 
 
 
-```
-> hadoop s3guard bucket-info -magic s3a://landsat-pds/
-Location: us-west-2
-
-S3A Client
-        Signing Algorithm: fs.s3a.signing-algorithm=(unset)
-        Endpoint: fs.s3a.endpoint=s3.amazonaws.com
-        Encryption: fs.s3a.encryption.algorithm=none
-        Input seek policy: fs.s3a.experimental.input.fadvise=normal
-        Change Detection Source: fs.s3a.change.detection.source=etag
-        Change Detection Mode: fs.s3a.change.detection.mode=server
-
-S3A Committers
-        The "magic" committer is supported in the filesystem
-        S3A Committer factory class: mapreduce.outputcommitter.factory.scheme.s3a=org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory
-        S3A Committer name: fs.s3a.committer.name=magic
-        Store magic committer integration: fs.s3a.committer.magic.enabled=true
-
-Security
-        Delegation token support is disabled
-
-Directory Markers
-        The directory marker policy is "keep"
-        Available Policies: delete, keep, authoritative
-        Authoritative paths: fs.s3a.authoritative.path=```
-```
-
 ### Error message: "File being created has a magic path, but the filesystem has magic file support disabled"
 ### Error message: "File being created has a magic path, but the filesystem has magic file support disabled"
 
 
 A file is being written to a path which is used for "magic" files,
 A file is being written to a path which is used for "magic" files,

+ 4 - 5
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/connecting.md

@@ -284,14 +284,13 @@ a bucket.
 The up to date list of regions is [Available online](https://docs.aws.amazon.com/general/latest/gr/s3.html).
 The up to date list of regions is [Available online](https://docs.aws.amazon.com/general/latest/gr/s3.html).
 
 
 This list can be used to specify the endpoint of individual buckets, for example
 This list can be used to specify the endpoint of individual buckets, for example
-for buckets in the central and EU/Ireland endpoints.
+for buckets in the us-west-2 and EU/Ireland endpoints.
 
 
 
 
 ```xml
 ```xml
 <property>
 <property>
-  <name>fs.s3a.bucket.landsat-pds.endpoint.region</name>
+  <name>fs.s3a.bucket.us-west-2-dataset.endpoint.region</name>
   <value>us-west-2</value>
   <value>us-west-2</value>
-  <description>The region for s3a://landsat-pds URLs</description>
 </property>
 </property>
 
 
 <property>
 <property>
@@ -354,9 +353,9 @@ The boolean option `fs.s3a.endpoint.fips` (default `false`) switches the S3A con
 For a single bucket:
 For a single bucket:
 ```xml
 ```xml
 <property>
 <property>
-  <name>fs.s3a.bucket.landsat-pds.endpoint.fips</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.endpoint.fips</name>
   <value>true</value>
   <value>true</value>
-  <description>Use the FIPS endpoint for the landsat dataset</description>
+  <description>Use the FIPS endpoint for the NOAA dataset</description>
 </property>
 </property>
 ```
 ```
 
 

+ 1 - 1
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md

@@ -188,7 +188,7 @@ If it was deployed unbonded, the DT Binding is asked to create a new DT.
 
 
 It is up to the binding what it includes in the token identifier, and how it obtains them.
 It is up to the binding what it includes in the token identifier, and how it obtains them.
 This new token identifier is included in a token which has a "canonical service name" of
 This new token identifier is included in a token which has a "canonical service name" of
-the URI of the filesystem (e.g "s3a://landsat-pds").
+the URI of the filesystem (e.g "s3a://noaa-isd-pds").
 
 
 The issued/reissued token identifier can be marshalled and reused.
 The issued/reissued token identifier can be marshalled and reused.
 
 

+ 7 - 7
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md

@@ -481,8 +481,8 @@ This will fetch the token and save it to the named file (here, `tokens.bin`),
 even if Kerberos is disabled.
 even if Kerberos is disabled.
 
 
 ```bash
 ```bash
-# Fetch a token for the AWS landsat-pds bucket and save it to tokens.bin
-$ hdfs fetchdt --webservice s3a://landsat-pds/  tokens.bin
+# Fetch a token for the AWS noaa-isd-pds bucket and save it to tokens.bin
+$ hdfs fetchdt --webservice s3a://noaa-isd-pds/ tokens.bin
 ```
 ```
 
 
 If the command fails with `ERROR: Failed to fetch token` it means the
 If the command fails with `ERROR: Failed to fetch token` it means the
@@ -498,11 +498,11 @@ host on which it was created.
 ```bash
 ```bash
 $ bin/hdfs fetchdt --print tokens.bin
 $ bin/hdfs fetchdt --print tokens.bin
 
 
-Token (S3ATokenIdentifier{S3ADelegationToken/Session; uri=s3a://landsat-pds;
+Token (S3ATokenIdentifier{S3ADelegationToken/Session; uri=s3a://noaa-isd-pds;
 timestamp=1541683947569; encryption=EncryptionSecrets{encryptionMethod=SSE_S3};
 timestamp=1541683947569; encryption=EncryptionSecrets{encryptionMethod=SSE_S3};
 Created on vm1.local/192.168.99.1 at time 2018-11-08T13:32:26.381Z.};
 Created on vm1.local/192.168.99.1 at time 2018-11-08T13:32:26.381Z.};
 Session credentials for user AAABWL expires Thu Nov 08 14:02:27 GMT 2018; (valid))
 Session credentials for user AAABWL expires Thu Nov 08 14:02:27 GMT 2018; (valid))
-for s3a://landsat-pds
+for s3a://noaa-isd-pds
 ```
 ```
 The "(valid)" annotation means that the AWS credentials are considered "valid":
 The "(valid)" annotation means that the AWS credentials are considered "valid":
 there is both a username and a secret.
 there is both a username and a secret.
@@ -513,11 +513,11 @@ If delegation support is enabled, it also prints the current
 hadoop security level.
 hadoop security level.
 
 
 ```bash
 ```bash
-$ hadoop s3guard bucket-info s3a://landsat-pds/
+$ hadoop s3guard bucket-info s3a://noaa-isd-pds/
 
 
-Filesystem s3a://landsat-pds
+Filesystem s3a://noaa-isd-pds
 Location: us-west-2
 Location: us-west-2
-Filesystem s3a://landsat-pds is not using S3Guard
+Filesystem s3a://noaa-isd-pds is not using S3Guard
 The "magic" committer is not supported
 The "magic" committer is not supported
 
 
 S3A Client
 S3A Client

+ 14 - 16
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/directory_markers.md

@@ -314,9 +314,8 @@ All releases of Hadoop which have been updated to be marker aware will support t
 Example: `s3guard bucket-info -markers aware` on a compatible release.
 Example: `s3guard bucket-info -markers aware` on a compatible release.
 
 
 ```
 ```
-> hadoop s3guard bucket-info -markers aware s3a://landsat-pds/
-Filesystem s3a://landsat-pds
-Location: us-west-2
+> hadoop s3guard bucket-info -markers aware s3a://noaa-isd-pds/
+Filesystem s3a://noaa-isd-pds
 
 
 ...
 ...
 
 
@@ -326,13 +325,14 @@ Directory Markers
         Authoritative paths: fs.s3a.authoritative.path=
         Authoritative paths: fs.s3a.authoritative.path=
         The S3A connector is compatible with buckets where directory markers are not deleted
         The S3A connector is compatible with buckets where directory markers are not deleted
 
 
+...
 ```
 ```
 
 
 The same command will fail on older releases, because the `-markers` option
 The same command will fail on older releases, because the `-markers` option
 is unknown
 is unknown
 
 
 ```
 ```
-> hadoop s3guard bucket-info -markers aware s3a://landsat-pds/
+> hadoop s3guard bucket-info -markers aware s3a://noaa-isd-pds/
 Illegal option -markers
 Illegal option -markers
 Usage: hadoop bucket-info [OPTIONS] s3a://BUCKET
 Usage: hadoop bucket-info [OPTIONS] s3a://BUCKET
     provide/check information about a specific bucket
     provide/check information about a specific bucket
@@ -354,9 +354,8 @@ Generic options supported are:
 A specific policy check verifies that the connector is configured as desired
 A specific policy check verifies that the connector is configured as desired
 
 
 ```
 ```
-> hadoop s3guard bucket-info -markers keep s3a://landsat-pds/
-Filesystem s3a://landsat-pds
-Location: us-west-2
+> hadoop s3guard bucket-info -markers keep s3a://noaa-isd-pds/
+Filesystem s3a://noaa-isd-pds
 
 
 ...
 ...
 
 
@@ -371,9 +370,8 @@ When probing for a specific policy, the error code "46" is returned if the activ
 does not match that requested:
 does not match that requested:
 
 
 ```
 ```
-> hadoop s3guard bucket-info -markers delete s3a://landsat-pds/
-Filesystem s3a://landsat-pds
-Location: us-west-2
+> hadoop s3guard bucket-info -markers delete s3a://noaa-isd-pds/
+Filesystem s3a://noaa-isd-pds
 
 
 S3A Client
 S3A Client
         Signing Algorithm: fs.s3a.signing-algorithm=(unset)
         Signing Algorithm: fs.s3a.signing-algorithm=(unset)
@@ -398,7 +396,7 @@ Directory Markers
         Authoritative paths: fs.s3a.authoritative.path=
         Authoritative paths: fs.s3a.authoritative.path=
 
 
 2021-11-22 16:03:59,175 [main] INFO  util.ExitUtil (ExitUtil.java:terminate(210))
 2021-11-22 16:03:59,175 [main] INFO  util.ExitUtil (ExitUtil.java:terminate(210))
- -Exiting with status 46: 46: Bucket s3a://landsat-pds: required marker polic is
+ -Exiting with status 46: 46: Bucket s3a://noaa-isd-pds: required marker polic is
   "keep" but actual policy is "delete"
   "keep" but actual policy is "delete"
 
 
 ```
 ```
@@ -450,10 +448,10 @@ Audit the path and fail if any markers were found.
 
 
 
 
 ```
 ```
-> hadoop s3guard markers -limit 8000 -audit s3a://landsat-pds/
+> hadoop s3guard markers -limit 8000 -audit s3a://noaa-isd-pds/
 
 
-The directory marker policy of s3a://landsat-pds is "Keep"
-2020-08-05 13:42:56,079 [main] INFO  tools.MarkerTool (DurationInfo.java:<init>(77)) - Starting: marker scan s3a://landsat-pds/
+The directory marker policy of s3a://noaa-isd-pds is "Keep"
+2020-08-05 13:42:56,079 [main] INFO  tools.MarkerTool (DurationInfo.java:<init>(77)) - Starting: marker scan s3a://noaa-isd-pds/
 Scanned 1,000 objects
 Scanned 1,000 objects
 Scanned 2,000 objects
 Scanned 2,000 objects
 Scanned 3,000 objects
 Scanned 3,000 objects
@@ -463,8 +461,8 @@ Scanned 6,000 objects
 Scanned 7,000 objects
 Scanned 7,000 objects
 Scanned 8,000 objects
 Scanned 8,000 objects
 Limit of scan reached - 8,000 objects
 Limit of scan reached - 8,000 objects
-2020-08-05 13:43:01,184 [main] INFO  tools.MarkerTool (DurationInfo.java:close(98)) - marker scan s3a://landsat-pds/: duration 0:05.107s
-No surplus directory markers were found under s3a://landsat-pds/
+2020-08-05 13:43:01,184 [main] INFO  tools.MarkerTool (DurationInfo.java:close(98)) - marker scan s3a://noaa-isd-pds/: duration 0:05.107s
+No surplus directory markers were found under s3a://noaa-isd-pds/
 Listing limit reached before completing the scan
 Listing limit reached before completing the scan
 2020-08-05 13:43:01,187 [main] INFO  util.ExitUtil (ExitUtil.java:terminate(210)) - Exiting with status 3:
 2020-08-05 13:43:01,187 [main] INFO  util.ExitUtil (ExitUtil.java:terminate(210)) - Exiting with status 3:
 ```
 ```

+ 7 - 8
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/encryption.md

@@ -616,15 +616,14 @@ header.x-amz-version-id="KcDOVmznIagWx3gP1HlDqcZvm1mFWZ2a"
 A file with no-encryption (on a bucket without versioning but with intelligent tiering):
 A file with no-encryption (on a bucket without versioning but with intelligent tiering):
 
 
 ```
 ```
-bin/hadoop fs -getfattr -d s3a://landsat-pds/scene_list.gz
+ bin/hadoop fs -getfattr -d s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz
 
 
-# file: s3a://landsat-pds/scene_list.gz
-header.Content-Length="45603307"
-header.Content-Type="application/octet-stream"
-header.ETag="39c34d489777a595b36d0af5726007db"
-header.Last-Modified="Wed Aug 29 01:45:15 BST 2018"
-header.x-amz-storage-class="INTELLIGENT_TIERING"
-header.x-amz-version-id="null"
+# file: s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz
+header.Content-Length="524671"
+header.Content-Type="binary/octet-stream"
+header.ETag=""3e39531220fbd3747d32cf93a79a7a0c""
+header.Last-Modified="Tue Jan 02 00:15:13 GMT 2024"
+header.x-amz-server-side-encryption="AES256"
 ```
 ```
 
 
 ###<a name="changing-encryption"></a> Use `rename()` to encrypt files with new keys
 ###<a name="changing-encryption"></a> Use `rename()` to encrypt files with new keys

+ 3 - 3
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md

@@ -503,7 +503,7 @@ explicitly opened up for broader access.
 ```bash
 ```bash
 hadoop fs -ls \
 hadoop fs -ls \
  -D fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
  -D fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
- s3a://landsat-pds/
+ s3a://noaa-isd-pds/
 ```
 ```
 
 
 1. Allowing anonymous access to an S3 bucket compromises
 1. Allowing anonymous access to an S3 bucket compromises
@@ -1630,11 +1630,11 @@ a session key:
 </property>
 </property>
 ```
 ```
 
 
-Finally, the public `s3a://landsat-pds/` bucket can be accessed anonymously:
+Finally, the public `s3a://noaa-isd-pds/` bucket can be accessed anonymously:
 
 
 ```xml
 ```xml
 <property>
 <property>
-  <name>fs.s3a.bucket.landsat-pds.aws.credentials.provider</name>
+  <name>fs.s3a.bucket.noaa-isd-pds.aws.credentials.provider</name>
   <value>org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider</value>
   <value>org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider</value>
 </property>
 </property>
 ```
 ```

+ 4 - 3
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md

@@ -447,7 +447,8 @@ An example of this is covered in [HADOOP-13871](https://issues.apache.org/jira/b
 
 
 1. For public data, use `curl`:
 1. For public data, use `curl`:
 
 
-        curl -O https://landsat-pds.s3.amazonaws.com/scene_list.gz
+        curl -O https://noaa-cors-pds.s3.amazonaws.com/raw/2023/001/akse/AKSE001a.23_.gz
+
 1. Use `nettop` to monitor a processes connections.
 1. Use `nettop` to monitor a processes connections.
 
 
 
 
@@ -696,7 +697,7 @@ via `FileSystem.get()` or `Path.getFileSystem()`.
 The cache, `FileSystem.CACHE` will, for each user, cachec one instance of a filesystem
 The cache, `FileSystem.CACHE` will, for each user, cachec one instance of a filesystem
 for a given URI.
 for a given URI.
 All calls to `FileSystem.get` for a cached FS for a URI such
 All calls to `FileSystem.get` for a cached FS for a URI such
-as `s3a://landsat-pds/` will return that singe single instance.
+as `s3a://noaa-isd-pds/` will return that singe single instance.
 
 
 FileSystem instances are created on-demand for the cache,
 FileSystem instances are created on-demand for the cache,
 and will be done in each thread which requests an instance.
 and will be done in each thread which requests an instance.
@@ -720,7 +721,7 @@ can be created simultaneously for different object stores/distributed
 filesystems.
 filesystems.
 
 
 For example, a value of four would put an upper limit on the number
 For example, a value of four would put an upper limit on the number
-of wasted instantiations of a connector for the `s3a://landsat-pds/`
+of wasted instantiations of a connector for the `s3a://noaa-isd-pds/`
 bucket.
 bucket.
 
 
 ```xml
 ```xml

+ 11 - 13
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md

@@ -260,22 +260,20 @@ define the target region in `auth-keys.xml`.
 ### <a name="csv"></a> CSV Data Tests
 ### <a name="csv"></a> CSV Data Tests
 
 
 The `TestS3AInputStreamPerformance` tests require read access to a multi-MB
 The `TestS3AInputStreamPerformance` tests require read access to a multi-MB
-text file. The default file for these tests is one published by amazon,
-[s3a://landsat-pds.s3.amazonaws.com/scene_list.gz](http://landsat-pds.s3.amazonaws.com/scene_list.gz).
-This is a gzipped CSV index of other files which amazon serves for open use.
+text file. The default file for these tests is a public one.
+`s3a://noaa-cors-pds/raw/2023/001/akse/AKSE001a.23_.gz`
+from the [NOAA Continuously Operating Reference Stations (CORS) Network (NCN)](https://registry.opendata.aws/noaa-ncn/)
 
 
 Historically it was required to be a `csv.gz` file to validate S3 Select
 Historically it was required to be a `csv.gz` file to validate S3 Select
 support. Now that S3 Select support has been removed, other large files
 support. Now that S3 Select support has been removed, other large files
 may be used instead.
 may be used instead.
-However, future versions may want to read a CSV file again, so testers
-should still reference one.
 
 
 The path to this object is set in the option `fs.s3a.scale.test.csvfile`,
 The path to this object is set in the option `fs.s3a.scale.test.csvfile`,
 
 
 ```xml
 ```xml
 <property>
 <property>
   <name>fs.s3a.scale.test.csvfile</name>
   <name>fs.s3a.scale.test.csvfile</name>
-  <value>s3a://landsat-pds/scene_list.gz</value>
+  <value>s3a://noaa-cors-pds/raw/2023/001/akse/AKSE001a.23_.gz</value>
 </property>
 </property>
 ```
 ```
 
 
@@ -285,6 +283,7 @@ is hosted in Amazon's US-east datacenter.
 1. If the data cannot be read for any reason then the test will fail.
 1. If the data cannot be read for any reason then the test will fail.
 1. If the property is set to a different path, then that data must be readable
 1. If the property is set to a different path, then that data must be readable
 and "sufficiently" large.
 and "sufficiently" large.
+1. If a `.gz` file, expect decompression-related test failures.
 
 
 (the reason the space or newline is needed is to add "an empty entry"; an empty
 (the reason the space or newline is needed is to add "an empty entry"; an empty
 `<value/>` would be considered undefined and pick up the default)
 `<value/>` would be considered undefined and pick up the default)
@@ -292,14 +291,13 @@ and "sufficiently" large.
 
 
 If using a test file in a different AWS S3 region then
 If using a test file in a different AWS S3 region then
 a bucket-specific region must be defined.
 a bucket-specific region must be defined.
-For the default test dataset, hosted in the `landsat-pds` bucket, this is:
+For the default test dataset, hosted in the `noaa-cors-pds` bucket, this is:
 
 
 ```xml
 ```xml
-<property>
-  <name>fs.s3a.bucket.landsat-pds.endpoint.region</name>
-  <value>us-west-2</value>
-  <description>The region for s3a://landsat-pds</description>
-</property>
+  <property>
+    <name>fs.s3a.bucket.noaa-cors-pds.endpoint.region</name>
+    <value>us-east-1</value>
+  </property>
 ```
 ```
 
 
 ### <a name="access"></a> Testing Access Point Integration
 ### <a name="access"></a> Testing Access Point Integration
@@ -857,7 +855,7 @@ the tests become skipped, rather than fail with a trace which is really a false
 The ordered test case mechanism of `AbstractSTestS3AHugeFiles` is probably
 The ordered test case mechanism of `AbstractSTestS3AHugeFiles` is probably
 the most elegant way of chaining test setup/teardown.
 the most elegant way of chaining test setup/teardown.
 
 
-Regarding reusing existing data, we tend to use the landsat archive of
+Regarding reusing existing data, we tend to use the noaa-cors-pds archive of
 AWS US-East for our testing of input stream operations. This doesn't work
 AWS US-East for our testing of input stream operations. This doesn't work
 against other regions, or with third party S3 implementations. Thus the
 against other regions, or with third party S3 implementations. Thus the
 URL can be overridden for testing elsewhere.
 URL can be overridden for testing elsewhere.

+ 2 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java

@@ -40,10 +40,10 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.LoggerFactory;
 
 
 import static org.apache.hadoop.fs.s3a.Constants.*;
 import static org.apache.hadoop.fs.s3a.Constants.*;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getCSVTestPath;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING;
 import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING;
 import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.CONSTRUCTOR_EXCEPTION;
 import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.CONSTRUCTOR_EXCEPTION;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.junit.Assert.*;
 import static org.junit.Assert.*;
 
 
@@ -207,7 +207,7 @@ public class ITestS3AAWSCredentialsProvider {
   @Test
   @Test
   public void testAnonymousProvider() throws Exception {
   public void testAnonymousProvider() throws Exception {
     Configuration conf = createConf(AnonymousAWSCredentialsProvider.class);
     Configuration conf = createConf(AnonymousAWSCredentialsProvider.class);
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
     try (FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf)) {
     try (FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf)) {
       Assertions.assertThat(fs)
       Assertions.assertThat(fs)
           .describedAs("Filesystem")
           .describedAs("Filesystem")

+ 14 - 23
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java

@@ -22,7 +22,6 @@ import software.amazon.awssdk.services.s3.model.ObjectIdentifier;
 import software.amazon.awssdk.services.s3.model.S3Error;
 import software.amazon.awssdk.services.s3.model.S3Error;
 
 
 import org.assertj.core.api.Assertions;
 import org.assertj.core.api.Assertions;
-import org.junit.Assume;
 
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.LocatedFileStatus;
@@ -47,6 +46,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.createFiles;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.createFiles;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.isBulkDeleteEnabled;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.isBulkDeleteEnabled;
 import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.failIf;
 import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.failIf;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireDefaultExternalData;
 import static org.apache.hadoop.test.LambdaTestUtils.*;
 import static org.apache.hadoop.test.LambdaTestUtils.*;
 import static org.apache.hadoop.util.functional.RemoteIterators.mappingRemoteIterator;
 import static org.apache.hadoop.util.functional.RemoteIterators.mappingRemoteIterator;
 import static org.apache.hadoop.util.functional.RemoteIterators.toList;
 import static org.apache.hadoop.util.functional.RemoteIterators.toList;
@@ -156,31 +156,22 @@ public class ITestS3AFailureHandling extends AbstractS3ATestBase {
     timer.end("removeKeys");
     timer.end("removeKeys");
   }
   }
 
 
-
-  private Path maybeGetCsvPath() {
-    Configuration conf = getConfiguration();
-    String csvFile = conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
-    Assume.assumeTrue("CSV test file is not the default",
-        DEFAULT_CSVTEST_FILE.equals(csvFile));
-    return new Path(csvFile);
-  }
-
   /**
   /**
    * Test low-level failure handling with low level delete request.
    * Test low-level failure handling with low level delete request.
    */
    */
   @Test
   @Test
   public void testMultiObjectDeleteNoPermissions() throws Throwable {
   public void testMultiObjectDeleteNoPermissions() throws Throwable {
-    describe("Delete the landsat CSV file and expect it to fail");
-    Path csvPath = maybeGetCsvPath();
-    S3AFileSystem fs = (S3AFileSystem) csvPath.getFileSystem(
+    describe("Delete the external file and expect it to fail");
+    Path path = requireDefaultExternalData(getConfiguration());
+    S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(
         getConfiguration());
         getConfiguration());
     // create a span, expect it to be activated.
     // create a span, expect it to be activated.
     fs.getAuditSpanSource().createSpan(StoreStatisticNames.OP_DELETE,
     fs.getAuditSpanSource().createSpan(StoreStatisticNames.OP_DELETE,
-        csvPath.toString(), null);
+        path.toString(), null);
     List<ObjectIdentifier> keys
     List<ObjectIdentifier> keys
         = buildDeleteRequest(
         = buildDeleteRequest(
             new String[]{
             new String[]{
-                fs.pathToKey(csvPath),
+                fs.pathToKey(path),
                 "missing-key.csv"
                 "missing-key.csv"
             });
             });
     MultiObjectDeleteException ex = intercept(
     MultiObjectDeleteException ex = intercept(
@@ -193,10 +184,10 @@ public class ITestS3AFailureHandling extends AbstractS3ATestBase {
     final String undeletedFiles = undeleted.stream()
     final String undeletedFiles = undeleted.stream()
         .map(Path::toString)
         .map(Path::toString)
         .collect(Collectors.joining(", "));
         .collect(Collectors.joining(", "));
-    failIf(undeleted.size() != 2,
-        "undeleted list size wrong: " + undeletedFiles,
-        ex);
-    assertTrue("no CSV in " +undeletedFiles, undeleted.contains(csvPath));
+    Assertions.assertThat(undeleted)
+        .describedAs("undeleted files")
+        .hasSize(2)
+        .contains(path);
   }
   }
 
 
   /**
   /**
@@ -205,12 +196,12 @@ public class ITestS3AFailureHandling extends AbstractS3ATestBase {
    */
    */
   @Test
   @Test
   public void testSingleObjectDeleteNoPermissionsTranslated() throws Throwable {
   public void testSingleObjectDeleteNoPermissionsTranslated() throws Throwable {
-    describe("Delete the landsat CSV file and expect it to fail");
-    Path csvPath = maybeGetCsvPath();
-    S3AFileSystem fs = (S3AFileSystem) csvPath.getFileSystem(
+    describe("Delete the external file and expect it to fail");
+    Path path = requireDefaultExternalData(getConfiguration());
+    S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(
         getConfiguration());
         getConfiguration());
     AccessDeniedException aex = intercept(AccessDeniedException.class,
     AccessDeniedException aex = intercept(AccessDeniedException.class,
-        () -> fs.delete(csvPath, false));
+        () -> fs.delete(path, false));
     Throwable cause = aex.getCause();
     Throwable cause = aex.getCause();
     failIf(cause == null, "no nested exception", aex);
     failIf(cause == null, "no nested exception", aex);
   }
   }

+ 47 - 38
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java

@@ -19,8 +19,9 @@
 package org.apache.hadoop.fs.s3a;
 package org.apache.hadoop.fs.s3a;
 
 
 import java.io.File;
 import java.io.File;
-import java.net.URI;
+import java.util.UUID;
 
 
+import org.assertj.core.api.Assertions;
 import org.junit.Before;
 import org.junit.Before;
 import org.junit.Test;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
@@ -30,15 +31,16 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;
 import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;
 
 
 import static org.apache.hadoop.fs.s3a.Constants.BUFFER_DIR;
 import static org.apache.hadoop.fs.s3a.Constants.BUFFER_DIR;
-import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_BLOCK_DEFAULT_SIZE;
 import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_BLOCK_SIZE_KEY;
 import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_BLOCK_SIZE_KEY;
 import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_ENABLED_KEY;
 import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_ENABLED_KEY;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
 import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
 import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
 
 
 /**
 /**
@@ -49,11 +51,21 @@ public class ITestS3APrefetchingCacheFiles extends AbstractS3ACostTest {
   private static final Logger LOG =
   private static final Logger LOG =
       LoggerFactory.getLogger(ITestS3APrefetchingCacheFiles.class);
       LoggerFactory.getLogger(ITestS3APrefetchingCacheFiles.class);
 
 
+  /** use a small file size so small source files will still work. */
+  public static final int BLOCK_SIZE = 128 * 1024;
+
+  public static final int PREFETCH_OFFSET = 10240;
+
   private Path testFile;
   private Path testFile;
+
+  /** The FS with the external file. */
   private FileSystem fs;
   private FileSystem fs;
+
   private int prefetchBlockSize;
   private int prefetchBlockSize;
   private Configuration conf;
   private Configuration conf;
 
 
+  private String bufferDir;
+
   public ITestS3APrefetchingCacheFiles() {
   public ITestS3APrefetchingCacheFiles() {
     super(true);
     super(true);
   }
   }
@@ -63,35 +75,31 @@ public class ITestS3APrefetchingCacheFiles extends AbstractS3ACostTest {
     super.setup();
     super.setup();
     // Sets BUFFER_DIR by calling S3ATestUtils#prepareTestConfiguration
     // Sets BUFFER_DIR by calling S3ATestUtils#prepareTestConfiguration
     conf = createConfiguration();
     conf = createConfiguration();
-    String testFileUri = S3ATestUtils.getCSVTestFile(conf);
 
 
-    testFile = new Path(testFileUri);
-    prefetchBlockSize = conf.getInt(PREFETCH_BLOCK_SIZE_KEY, PREFETCH_BLOCK_DEFAULT_SIZE);
-    fs = getFileSystem();
-    fs.initialize(new URI(testFileUri), conf);
+    testFile = getExternalData(conf);
+    prefetchBlockSize = conf.getInt(PREFETCH_BLOCK_SIZE_KEY, BLOCK_SIZE);
+    fs = FileSystem.get(testFile.toUri(), conf);
   }
   }
 
 
   @Override
   @Override
   public Configuration createConfiguration() {
   public Configuration createConfiguration() {
     Configuration configuration = super.createConfiguration();
     Configuration configuration = super.createConfiguration();
     S3ATestUtils.removeBaseAndBucketOverrides(configuration, PREFETCH_ENABLED_KEY);
     S3ATestUtils.removeBaseAndBucketOverrides(configuration, PREFETCH_ENABLED_KEY);
-    S3ATestUtils.removeBaseAndBucketOverrides(configuration, PREFETCH_BLOCK_SIZE_KEY);
     configuration.setBoolean(PREFETCH_ENABLED_KEY, true);
     configuration.setBoolean(PREFETCH_ENABLED_KEY, true);
+    // use a small block size unless explicitly set in the test config.
+    configuration.setInt(PREFETCH_BLOCK_SIZE_KEY, BLOCK_SIZE);
+    // patch buffer dir with a unique path for test isolation.
+    final String bufferDirBase = configuration.get(BUFFER_DIR);
+    bufferDir = bufferDirBase + "/" + UUID.randomUUID();
+    configuration.set(BUFFER_DIR, bufferDir);
     return configuration;
     return configuration;
   }
   }
 
 
   @Override
   @Override
   public synchronized void teardown() throws Exception {
   public synchronized void teardown() throws Exception {
     super.teardown();
     super.teardown();
-    File tmpFileDir = new File(conf.get(BUFFER_DIR));
-    File[] tmpFiles = tmpFileDir.listFiles();
-    if (tmpFiles != null) {
-      for (File filePath : tmpFiles) {
-        String path = filePath.getPath();
-        if (path.endsWith(".bin") && path.contains("fs-cache-")) {
-          filePath.delete();
-        }
-      }
+    if (bufferDir != null) {
+      new File(bufferDir).delete();
     }
     }
     cleanupWithLogger(LOG, fs);
     cleanupWithLogger(LOG, fs);
     fs = null;
     fs = null;
@@ -111,34 +119,35 @@ public class ITestS3APrefetchingCacheFiles extends AbstractS3ACostTest {
     try (FSDataInputStream in = fs.open(testFile)) {
     try (FSDataInputStream in = fs.open(testFile)) {
       byte[] buffer = new byte[prefetchBlockSize];
       byte[] buffer = new byte[prefetchBlockSize];
 
 
-      in.read(buffer, 0, prefetchBlockSize - 10240);
-      in.seek(prefetchBlockSize * 2);
-      in.read(buffer, 0, prefetchBlockSize);
+      // read a bit less than a block
+      in.readFully(0, buffer, 0, prefetchBlockSize - PREFETCH_OFFSET);
+      // read at least some of a second block
+      in.read(prefetchBlockSize * 2, buffer, 0, prefetchBlockSize);
+
 
 
       File tmpFileDir = new File(conf.get(BUFFER_DIR));
       File tmpFileDir = new File(conf.get(BUFFER_DIR));
-      assertTrue("The dir to keep cache files must exist", tmpFileDir.exists());
+      final LocalFileSystem localFs = FileSystem.getLocal(conf);
+      Path bufferDirPath = new Path(tmpFileDir.toURI());
+      ContractTestUtils.assertIsDirectory(localFs, bufferDirPath);
       File[] tmpFiles = tmpFileDir
       File[] tmpFiles = tmpFileDir
           .listFiles((dir, name) -> name.endsWith(".bin") && name.contains("fs-cache-"));
           .listFiles((dir, name) -> name.endsWith(".bin") && name.contains("fs-cache-"));
-      boolean isCacheFileForBlockFound = tmpFiles != null && tmpFiles.length > 0;
-      if (!isCacheFileForBlockFound) {
-        LOG.warn("No cache files found under " + tmpFileDir);
-      }
-      assertTrue("File to cache block data must exist", isCacheFileForBlockFound);
+      Assertions.assertThat(tmpFiles)
+          .describedAs("Cache files not found under %s", tmpFileDir)
+          .isNotEmpty();
+
 
 
       for (File tmpFile : tmpFiles) {
       for (File tmpFile : tmpFiles) {
         Path path = new Path(tmpFile.getAbsolutePath());
         Path path = new Path(tmpFile.getAbsolutePath());
-        try (FileSystem localFs = FileSystem.getLocal(conf)) {
-          FileStatus stat = localFs.getFileStatus(path);
-          ContractTestUtils.assertIsFile(path, stat);
-          assertEquals("File length not matching with prefetchBlockSize", prefetchBlockSize,
-              stat.getLen());
-          assertEquals("User permissions should be RW", FsAction.READ_WRITE,
-              stat.getPermission().getUserAction());
-          assertEquals("Group permissions should be NONE", FsAction.NONE,
-              stat.getPermission().getGroupAction());
-          assertEquals("Other permissions should be NONE", FsAction.NONE,
-              stat.getPermission().getOtherAction());
-        }
+        FileStatus stat = localFs.getFileStatus(path);
+        ContractTestUtils.assertIsFile(path, stat);
+        assertEquals("File length not matching with prefetchBlockSize", prefetchBlockSize,
+            stat.getLen());
+        assertEquals("User permissions should be RW", FsAction.READ_WRITE,
+            stat.getPermission().getUserAction());
+        assertEquals("Group permissions should be NONE", FsAction.NONE,
+            stat.getPermission().getGroupAction());
+        assertEquals("Other permissions should be NONE", FsAction.NONE,
+            stat.getPermission().getOtherAction());
       }
       }
     }
     }
   }
   }

+ 6 - 4
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java

@@ -111,14 +111,16 @@ public interface S3ATestConstants {
   String KEY_CSVTEST_FILE = S3A_SCALE_TEST + "csvfile";
   String KEY_CSVTEST_FILE = S3A_SCALE_TEST + "csvfile";
 
 
   /**
   /**
-   * The landsat bucket: {@value}.
+   * Default path for the multi MB test file: {@value}.
+   * @deprecated retrieve via {@link PublicDatasetTestUtils}.
    */
    */
-  String LANDSAT_BUCKET = "s3a://landsat-pds/";
+  @Deprecated
+  String DEFAULT_CSVTEST_FILE = PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE;
 
 
   /**
   /**
-   * Default path for the multi MB test file: {@value}.
+   * Example path for unit tests; this is never accessed: {@value}.
    */
    */
-  String DEFAULT_CSVTEST_FILE = LANDSAT_BUCKET + "scene_list.gz";
+  String UNIT_TEST_EXAMPLE_PATH = "s3a://example/data/";
 
 
   /**
   /**
    * Configuration key for an existing object in a requester pays bucket: {@value}.
    * Configuration key for an existing object in a requester pays bucket: {@value}.

+ 14 - 11
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java

@@ -105,6 +105,8 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile;
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion;
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion;
 import static org.apache.hadoop.fs.s3a.impl.S3ExpressStorage.STORE_CAPABILITY_S3_EXPRESS_STORAGE;
 import static org.apache.hadoop.fs.s3a.impl.S3ExpressStorage.STORE_CAPABILITY_S3_EXPRESS_STORAGE;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireDefaultExternalDataFile;
 import static org.apache.hadoop.test.GenericTestUtils.buildPaths;
 import static org.apache.hadoop.test.GenericTestUtils.buildPaths;
 import static org.apache.hadoop.util.Preconditions.checkNotNull;
 import static org.apache.hadoop.util.Preconditions.checkNotNull;
 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH;
 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH;
@@ -405,22 +407,22 @@ public final class S3ATestUtils {
    * Get the test CSV file; assume() that it is not empty.
    * Get the test CSV file; assume() that it is not empty.
    * @param conf test configuration
    * @param conf test configuration
    * @return test file.
    * @return test file.
+   * @deprecated Retained only to assist cherrypicking patches
    */
    */
+  @Deprecated
   public static String getCSVTestFile(Configuration conf) {
   public static String getCSVTestFile(Configuration conf) {
-    String csvFile = conf
-        .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
-    Assume.assumeTrue("CSV test file is not the default",
-        isNotEmpty(csvFile));
-    return csvFile;
+    return getExternalData(conf).toUri().toString();
   }
   }
 
 
   /**
   /**
    * Get the test CSV path; assume() that it is not empty.
    * Get the test CSV path; assume() that it is not empty.
    * @param conf test configuration
    * @param conf test configuration
    * @return test file as a path.
    * @return test file as a path.
+   * @deprecated Retained only to assist cherrypicking patches
    */
    */
+  @Deprecated
   public static Path getCSVTestPath(Configuration conf) {
   public static Path getCSVTestPath(Configuration conf) {
-    return new Path(getCSVTestFile(conf));
+    return getExternalData(conf);
   }
   }
 
 
   /**
   /**
@@ -429,12 +431,11 @@ public final class S3ATestUtils {
    * read only).
    * read only).
    * @return test file.
    * @return test file.
    * @param conf test configuration
    * @param conf test configuration
+   * @deprecated Retained only to assist cherrypicking patches
    */
    */
+  @Deprecated
   public static String getLandsatCSVFile(Configuration conf) {
   public static String getLandsatCSVFile(Configuration conf) {
-    String csvFile = getCSVTestFile(conf);
-    Assume.assumeTrue("CSV test file is not the default",
-        DEFAULT_CSVTEST_FILE.equals(csvFile));
-    return csvFile;
+    return requireDefaultExternalDataFile(conf);
   }
   }
   /**
   /**
    * Get the test CSV file; assume() that it is not modified (i.e. we haven't
    * Get the test CSV file; assume() that it is not modified (i.e. we haven't
@@ -442,9 +443,11 @@ public final class S3ATestUtils {
    * read only).
    * read only).
    * @param conf test configuration
    * @param conf test configuration
    * @return test file as a path.
    * @return test file as a path.
+   * @deprecated Retained only to assist cherrypicking patches
    */
    */
+  @Deprecated
   public static Path getLandsatCSVPath(Configuration conf) {
   public static Path getLandsatCSVPath(Configuration conf) {
-    return new Path(getLandsatCSVFile(conf));
+    return getExternalData(conf);
   }
   }
 
 
   /**
   /**

+ 9 - 12
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java

@@ -54,37 +54,34 @@ import org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider;
 import org.apache.hadoop.fs.s3a.auth.NoAuthWithAWSException;
 import org.apache.hadoop.fs.s3a.auth.NoAuthWithAWSException;
 import org.apache.hadoop.fs.s3a.auth.delegation.CountInvocationsProvider;
 import org.apache.hadoop.fs.s3a.auth.delegation.CountInvocationsProvider;
 import org.apache.hadoop.fs.s3a.impl.InstantiationIOException;
 import org.apache.hadoop.fs.s3a.impl.InstantiationIOException;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.util.Sets;
 import org.apache.hadoop.util.Sets;
 
 
 import static org.apache.hadoop.fs.s3a.Constants.ASSUMED_ROLE_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.ASSUMED_ROLE_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER_MAPPING;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER_MAPPING;
-import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_CSVTEST_FILE;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.authenticationContains;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.authenticationContains;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.buildClassListString;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.buildClassListString;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getCSVTestPath;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.STANDARD_AWS_PROVIDERS;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.STANDARD_AWS_PROVIDERS;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.buildAWSProviderList;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.buildAWSProviderList;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.createAWSCredentialProviderList;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.createAWSCredentialProviderList;
 import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.DOES_NOT_IMPLEMENT;
 import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.DOES_NOT_IMPLEMENT;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture;
 import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
 
 
 /**
 /**
  * Unit tests for {@link Constants#AWS_CREDENTIALS_PROVIDER} logic.
  * Unit tests for {@link Constants#AWS_CREDENTIALS_PROVIDER} logic.
  */
  */
-public class TestS3AAWSCredentialsProvider {
+public class TestS3AAWSCredentialsProvider extends AbstractS3ATestBase {
 
 
   /**
   /**
-   * URI of the landsat images.
+   * URI of the test file: this must be anonymously accessible.
+   * As these are unit tests no actual connection to the store is made.
    */
    */
   private static final URI TESTFILE_URI = new Path(
   private static final URI TESTFILE_URI = new Path(
-      DEFAULT_CSVTEST_FILE).toUri();
+      PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE).toUri();
 
 
   private static final Logger LOG = LoggerFactory.getLogger(TestS3AAWSCredentialsProvider.class);
   private static final Logger LOG = LoggerFactory.getLogger(TestS3AAWSCredentialsProvider.class);
 
 
@@ -127,7 +124,7 @@ public class TestS3AAWSCredentialsProvider {
         TemporaryAWSCredentialsProvider.NAME
         TemporaryAWSCredentialsProvider.NAME
             + ", \t" + SimpleAWSCredentialsProvider.NAME
             + ", \t" + SimpleAWSCredentialsProvider.NAME
             + " ,\n " + AnonymousAWSCredentialsProvider.NAME);
             + " ,\n " + AnonymousAWSCredentialsProvider.NAME);
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
 
 
     AWSCredentialProviderList list = createAWSCredentialProviderList(
     AWSCredentialProviderList list = createAWSCredentialProviderList(
         testFile.toUri(), conf);
         testFile.toUri(), conf);
@@ -586,7 +583,7 @@ public class TestS3AAWSCredentialsProvider {
   @Test
   @Test
   public void testConcurrentAuthentication() throws Throwable {
   public void testConcurrentAuthentication() throws Throwable {
     Configuration conf = createProviderConfiguration(SlowProvider.class.getName());
     Configuration conf = createProviderConfiguration(SlowProvider.class.getName());
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
 
 
     AWSCredentialProviderList list = createAWSCredentialProviderList(testFile.toUri(), conf);
     AWSCredentialProviderList list = createAWSCredentialProviderList(testFile.toUri(), conf);
 
 
@@ -656,7 +653,7 @@ public class TestS3AAWSCredentialsProvider {
   @Test
   @Test
   public void testConcurrentAuthenticationError() throws Throwable {
   public void testConcurrentAuthenticationError() throws Throwable {
     Configuration conf = createProviderConfiguration(ErrorProvider.class.getName());
     Configuration conf = createProviderConfiguration(ErrorProvider.class.getName());
-    Path testFile = getCSVTestPath(conf);
+    Path testFile = getExternalData(conf);
 
 
     AWSCredentialProviderList list = createAWSCredentialProviderList(testFile.toUri(), conf);
     AWSCredentialProviderList list = createAWSCredentialProviderList(testFile.toUri(), conf);
     ErrorProvider provider = (ErrorProvider) list.getProviders().get(0);
     ErrorProvider provider = (ErrorProvider) list.getProviders().get(0);

+ 3 - 3
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/adapter/TestV1CredentialsProvider.java

@@ -39,9 +39,9 @@ import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
 import org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider;
 import org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider;
 import org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider;
 import org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider;
 import org.apache.hadoop.fs.s3a.impl.InstantiationIOException;
 import org.apache.hadoop.fs.s3a.impl.InstantiationIOException;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 
 
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
 import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
-import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_CSVTEST_FILE;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.ANONYMOUS_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.ANONYMOUS_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.EC2_CONTAINER_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.EC2_CONTAINER_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.ENVIRONMENT_CREDENTIALS_V1;
 import static org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.ENVIRONMENT_CREDENTIALS_V1;
@@ -56,10 +56,10 @@ import static org.junit.Assert.assertTrue;
 public class TestV1CredentialsProvider {
 public class TestV1CredentialsProvider {
 
 
   /**
   /**
-   * URI of the landsat images.
+   * URI of the test file.
    */
    */
   private static final URI TESTFILE_URI = new Path(
   private static final URI TESTFILE_URI = new Path(
-      DEFAULT_CSVTEST_FILE).toUri();
+      PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE).toUri();
 
 
   private static final Logger LOG = LoggerFactory.getLogger(TestV1CredentialsProvider.class);
   private static final Logger LOG = LoggerFactory.getLogger(TestV1CredentialsProvider.class);
 
 

+ 2 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java

@@ -46,7 +46,6 @@ import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.s3a.AWSBadRequestException;
 import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
-import org.apache.hadoop.fs.s3a.S3ATestConstants;
 import org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider;
 import org.apache.hadoop.fs.s3a.commit.CommitConstants;
 import org.apache.hadoop.fs.s3a.commit.files.PendingSet;
@@ -68,6 +67,7 @@ import static org.apache.hadoop.fs.s3a.auth.RolePolicies.*;
 import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.forbidden;
 import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.newAssumedRoleConfig;
 import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath;
 import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsSourceToString;
 import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
 import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
@@ -115,7 +115,7 @@ public class ITestAssumeRole extends AbstractS3ATestBase {
   public void setup() throws Exception {
     super.setup();
     assumeRoleTests();
-    uri = new URI(S3ATestConstants.DEFAULT_CSVTEST_FILE);
+    uri = requireAnonymousDataPath(getConfiguration()).toUri();
   }
 
   @Override

+ 21 - 10
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java

@@ -58,6 +58,8 @@ import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.probeForAssumedRoleARN
 import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*;
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled;
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.closeUserFileSystems;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getOrcData;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath;
 
 /**
  * Submit a job with S3 delegation tokens.
@@ -106,10 +108,17 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
 
   private Path destPath;
 
-  private static final Path EXTRA_JOB_RESOURCE_PATH
-      = new Path("s3a://osm-pds/planet/planet-latest.orc");
+  /**
+   * Path of the extra job resource; set up in
+   * {@link #createConfiguration()}.
+   */
+  private Path extraJobResourcePath;
 
-  public static final URI jobResource = EXTRA_JOB_RESOURCE_PATH.toUri();
+  /**
+   * URI of the extra job resource; set up in
+   * {@link #createConfiguration()}.
+   */
+  private URI jobResourceUri;
 
   /**
    * Test array for parameterized test runs.
@@ -161,7 +170,9 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
         10_000);
 
-    String host = jobResource.getHost();
+    extraJobResourcePath = getOrcData(conf);
+    jobResourceUri = extraJobResourcePath.toUri();
+    String host = jobResourceUri.getHost();
     // and fix to the main endpoint if the caller has moved
     conf.set(
         String.format("fs.s3a.bucket.%s.endpoint", host), "");
@@ -229,9 +240,9 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
 
   @Test
   public void testCommonCrawlLookup() throws Throwable {
-    FileSystem resourceFS = EXTRA_JOB_RESOURCE_PATH.getFileSystem(
+    FileSystem resourceFS = extraJobResourcePath.getFileSystem(
         getConfiguration());
-    FileStatus status = resourceFS.getFileStatus(EXTRA_JOB_RESOURCE_PATH);
+    FileStatus status = resourceFS.getFileStatus(extraJobResourcePath);
     LOG.info("Extra job resource is {}", status);
     assertTrue("Not encrypted: " + status, status.isEncrypted());
   }
@@ -241,9 +252,9 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     describe("Mock Job test");
     JobConf conf = new JobConf(getConfiguration());
 
-    // the input here is the landsat file; which lets
+    // the input here is the external file; which lets
     // us differentiate source URI from dest URI
-    Path input = new Path(DEFAULT_CSVTEST_FILE);
+    Path input = requireAnonymousDataPath(getConfiguration());
     final FileSystem sourceFS = input.getFileSystem(conf);
 
 
@@ -272,7 +283,7 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     // This is to actually stress the terasort code for which
     // the yarn ResourceLocalizationService was having problems with
     // fetching resources from.
-    URI partitionUri = new URI(EXTRA_JOB_RESOURCE_PATH.toString() +
+    URI partitionUri = new URI(extraJobResourcePath.toString() +
         "#_partition.lst");
     job.addCacheFile(partitionUri);
 
@@ -302,7 +313,7 @@ public class ITestDelegatedMRJob extends AbstractDelegationIT {
     // look up the destination token
     lookupToken(submittedCredentials, fs.getUri(), tokenKind);
     lookupToken(submittedCredentials,
-        EXTRA_JOB_RESOURCE_PATH.getFileSystem(conf).getUri(), tokenKind);
+        extraJobResourcePath.getFileSystem(conf).getUri(), tokenKind);
   }
 
 }

+ 2 - 3
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFilesystem.java

@@ -53,8 +53,7 @@ public class ITestRoleDelegationInFilesystem extends
 
   /**
    * This verifies that the granted credentials only access the target bucket
-   * by using the credentials in a new S3 client to query the AWS-owned landsat
-   * bucket.
+   * by using the credentials in a new S3 client to query the public data bucket.
    * @param delegatedFS delegated FS with role-restricted access.
    * @throws Exception failure
    */
@@ -62,7 +61,7 @@ public class ITestRoleDelegationInFilesystem extends
   protected void verifyRestrictedPermissions(final S3AFileSystem delegatedFS)
       throws Exception {
     intercept(AccessDeniedException.class,
-        () -> readLandsatMetadata(delegatedFS));
+        () -> readExternalDatasetMetadata(delegatedFS));
   }
 
 }

+ 9 - 8
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFilesystem.java

@@ -79,6 +79,7 @@ import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationTokenIOExceptio
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.ALICE;
 import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled;
 import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.lookupS3ADelegationToken;
+import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath;
 import static org.apache.hadoop.test.LambdaTestUtils.doAs;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.hamcrest.Matchers.containsString;
@@ -344,7 +345,7 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
     // TODO: Check what should happen here. Calling headObject() on the root path fails in V2,
     // with the error that key cannot be empty.
    // fs.getObjectMetadata(new Path("/"));
-    readLandsatMetadata(fs);
+    readExternalDatasetMetadata(fs);
 
     URI uri = fs.getUri();
     // create delegation tokens from the test suites FS.
@@ -463,13 +464,13 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
   }
 
   /**
-   * Session tokens can read the landsat bucket without problems.
+   * Session tokens can read the external bucket without problems.
    * @param delegatedFS delegated FS
    * @throws Exception failure
    */
   protected void verifyRestrictedPermissions(final S3AFileSystem delegatedFS)
       throws Exception {
-    readLandsatMetadata(delegatedFS);
+    readExternalDatasetMetadata(delegatedFS);
   }
 
   @Test
@@ -582,7 +583,7 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
 
   /**
    * This verifies that the granted credentials only access the target bucket
-   * by using the credentials in a new S3 client to query the AWS-owned landsat
+   * by using the credentials in a new S3 client to query the external
    * bucket.
    * @param delegatedFS delegated FS with role-restricted access.
    * @throws AccessDeniedException if the delegated FS's credentials can't
@@ -590,17 +591,17 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
    * @return result of the HEAD
    * @throws Exception failure
    */
-  protected HeadBucketResponse readLandsatMetadata(final S3AFileSystem delegatedFS)
+  protected HeadBucketResponse readExternalDatasetMetadata(final S3AFileSystem delegatedFS)
       throws Exception {
     AWSCredentialProviderList testingCreds
         = delegatedFS.getS3AInternals().shareCredentials("testing");
 
-    URI landsat = new URI(DEFAULT_CSVTEST_FILE);
+    URI external = requireAnonymousDataPath(getConfiguration()).toUri();
     DefaultS3ClientFactory factory
         = new DefaultS3ClientFactory();
     Configuration conf = delegatedFS.getConf();
     factory.setConf(conf);
-    String host = landsat.getHost();
+    String host = external.getHost();
     S3ClientFactory.S3ClientCreationParameters parameters = null;
     parameters = new S3ClientFactory.S3ClientCreationParameters()
         .withCredentialSet(testingCreds)
@@ -609,7 +610,7 @@ public class ITestSessionDelegationInFilesystem extends AbstractDelegationIT {
             .newStatisticsFromAwsSdk())
         .withUserAgentSuffix("ITestSessionDelegationInFilesystem");
 
-    S3Client s3 = factory.createS3Client(landsat, parameters);
+    S3Client s3 = factory.createS3Client(external, parameters);
 
     return Invoker.once("HEAD", host,
         () -> s3.headBucket(b -> b.bucket(host)));

+ 8 - 8
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java

@@ -24,10 +24,10 @@ import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.hadoop.fs.s3a.S3AEncryptionMethods;
-import org.apache.hadoop.fs.s3a.S3ATestConstants;
 import org.apache.hadoop.fs.s3a.S3ATestUtils;
 import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding;
 import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.SecretManager;
@@ -44,11 +44,11 @@ import static org.junit.Assert.assertTrue;
  */
 public class TestS3ADelegationTokenSupport {
 
-  private static URI landsatUri;
+  private static URI externalUri;
 
   @BeforeClass
   public static void classSetup() throws Exception {
-    landsatUri = new URI(S3ATestConstants.DEFAULT_CSVTEST_FILE);
+    externalUri = new URI(PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE);
   }
 
   @Test
@@ -74,7 +74,7 @@ public class TestS3ADelegationTokenSupport {
         = new SessionTokenIdentifier(SESSION_TOKEN_KIND,
         alice,
         renewer,
-        new URI("s3a://landsat-pds/"),
+        new URI("s3a://anything/"),
         new MarshalledCredentials("a", "b", ""),
         new EncryptionSecrets(S3AEncryptionMethods.SSE_S3, ""),
         "origin");
@@ -116,7 +116,7 @@ public class TestS3ADelegationTokenSupport {
         SESSION_TOKEN_KIND,
         new Text(),
         renewer,
-        landsatUri,
+        externalUri,
         new MarshalledCredentials("a", "b", "c"),
         new EncryptionSecrets(), "");
 
@@ -135,7 +135,7 @@ public class TestS3ADelegationTokenSupport {
         SESSION_TOKEN_KIND,
         new Text(),
         null,
-        landsatUri,
+        externalUri,
         new MarshalledCredentials("a", "b", "c"),
         new EncryptionSecrets(), "");
 
@@ -151,7 +151,7 @@ public class TestS3ADelegationTokenSupport {
   @Test
   public void testRoleTokenIdentifierRoundTrip() throws Throwable {
     RoleTokenIdentifier id = new RoleTokenIdentifier(
-        landsatUri,
+        externalUri,
         new Text(),
         new Text(),
         new MarshalledCredentials("a", "b", "c"),
@@ -170,7 +170,7 @@ public class TestS3ADelegationTokenSupport {
   public void testFullTokenIdentifierRoundTrip() throws Throwable {
     Text renewer = new Text("renewerName");
     FullCredentialsTokenIdentifier id = new FullCredentialsTokenIdentifier(
-        landsatUri,
+        externalUri,
         new Text(),
         renewer,
         new MarshalledCredentials("a", "b", ""),

+ 2 - 1
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestPaths.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.test.HadoopTestBase;
 
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.UNIT_TEST_EXAMPLE_PATH;
 import static org.apache.hadoop.fs.s3a.commit.staging.Paths.*;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 
@@ -81,7 +82,7 @@ public class TestPaths extends HadoopTestBase {
     assertEquals("from " + path, expected, addUUID(path, "UUID"));
   }
 
-  private static final String DATA = "s3a://landsat-pds/data/";
+  private static final String DATA = UNIT_TEST_EXAMPLE_PATH;
   private static final Path BASE = new Path(DATA);
 
   @Test

+ 20 - 22
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java

@@ -22,14 +22,17 @@ import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.InputStreamReader;
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
 import org.junit.Test;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.test.LambdaTestUtils;
 import org.apache.hadoop.util.StringUtils;
 
@@ -40,7 +43,6 @@ import static org.apache.hadoop.fs.s3a.MultipartTestUtils.assertNoUploadsAt;
 import static org.apache.hadoop.fs.s3a.MultipartTestUtils.clearAnyUploads;
 import static org.apache.hadoop.fs.s3a.MultipartTestUtils.countUploadsAt;
 import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createPartUpload;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVFile;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.BucketInfo;
 import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_BAD_STATE;
@@ -57,36 +59,32 @@ public class ITestS3GuardTool extends AbstractS3GuardToolTestBase {
       "-force", "-verbose"};
 
   @Test
-  public void testLandsatBucketUnguarded() throws Throwable {
-    run(BucketInfo.NAME,
-        "-" + BucketInfo.UNGUARDED_FLAG,
-        getLandsatCSVFile(getConfiguration()));
-  }
-
-  @Test
-  public void testLandsatBucketRequireGuarded() throws Throwable {
-    runToFailure(E_BAD_STATE,
-        BucketInfo.NAME,
-        "-" + BucketInfo.GUARDED_FLAG,
-        getLandsatCSVFile(
-            ITestS3GuardTool.this.getConfiguration()));
-  }
-
-  @Test
-  public void testLandsatBucketRequireUnencrypted() throws Throwable {
+  public void testExternalBucketRequireUnencrypted() throws Throwable {
     removeBaseAndBucketOverrides(getConfiguration(), S3_ENCRYPTION_ALGORITHM);
     run(BucketInfo.NAME,
         "-" + BucketInfo.ENCRYPTION_FLAG, "none",
-        getLandsatCSVFile(getConfiguration()));
+        externalBucket());
+  }
+
+  /**
+   * Get the external bucket; this is of the default external file.
+   * If not set to the default value, the test will be skipped.
+   * @return the bucket of the default external file.
+   */
+  private String externalBucket() {
+    Configuration conf = getConfiguration();
+    Path result = PublicDatasetTestUtils.requireDefaultExternalData(conf);
+    final URI uri = result.toUri();
+    final String bucket = uri.getScheme() + "://" + uri.getHost();
+    return bucket;
   }
 
   @Test
-  public void testLandsatBucketRequireEncrypted() throws Throwable {
+  public void testExternalBucketRequireEncrypted() throws Throwable {
     runToFailure(E_BAD_STATE,
         BucketInfo.NAME,
         "-" + BucketInfo.ENCRYPTION_FLAG,
-        "AES256", getLandsatCSVFile(
-            ITestS3GuardTool.this.getConfiguration()));
+        "AES256", externalBucket());
   }
 
   @Test

+ 3 - 2
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestAuthoritativePath.java

@@ -33,6 +33,7 @@ import org.apache.hadoop.fs.s3a.S3AUtils;
 import org.apache.hadoop.test.AbstractHadoopTestBase;
 
 import static org.apache.hadoop.fs.s3a.Constants.AUTHORITATIVE_PATH;
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.UNIT_TEST_EXAMPLE_PATH;
 import static org.assertj.core.api.Assertions.assertThat;
 
 /**
@@ -71,7 +72,7 @@ public class TestAuthoritativePath extends AbstractHadoopTestBase {
   @Test
   public void testOtherBucket() throws Throwable {
     assertAuthPaths(l("/one/",
-        "s3a://landsat-pds/",
+        UNIT_TEST_EXAMPLE_PATH,
         BASE + "/two/"),
         "/one/", "/two/");
   }
@@ -79,7 +80,7 @@ public class TestAuthoritativePath extends AbstractHadoopTestBase {
   @Test
   public void testOtherScheme() throws Throwable {
     assertAuthPaths(l("/one/",
-        "s3a://landsat-pds/",
+         UNIT_TEST_EXAMPLE_PATH,
         "http://bucket/two/"),
         "/one/");
   }

+ 15 - 3
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java

@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.s3a.S3AInputPolicy;
 import org.apache.hadoop.fs.s3a.S3AInputStream;
 import org.apache.hadoop.fs.s3a.S3ATestUtils;
 import org.apache.hadoop.fs.s3a.statistics.S3AInputStreamStatistics;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.fs.statistics.IOStatistics;
 import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
 import org.apache.hadoop.fs.statistics.MeanStatistic;
@@ -112,7 +113,9 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase {
     Configuration conf = getConf();
     conf.setInt(SOCKET_SEND_BUFFER, 16 * 1024);
     conf.setInt(SOCKET_RECV_BUFFER, 16 * 1024);
-    String testFile =  conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
+    // look up the test file, no requirement to be set.
+    String testFile =  conf.getTrimmed(KEY_CSVTEST_FILE,
+        PublicDatasetTestUtils.DEFAULT_EXTERNAL_FILE);
     if (testFile.isEmpty()) {
       assumptionMessage = "Empty test property: " + KEY_CSVTEST_FILE;
       LOG.warn(assumptionMessage);
@@ -394,6 +397,9 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase {
     CompressionCodecFactory factory
         = new CompressionCodecFactory(getConf());
     CompressionCodec codec = factory.getCodec(testData);
+    Assertions.assertThat(codec)
+        .describedAs("No codec found for %s", testData)
+        .isNotNull();
     long bytesRead = 0;
     int lines = 0;
 
@@ -525,12 +531,18 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase {
     describe("Random IO with policy \"%s\"", policy);
     byte[] buffer = new byte[_1MB];
     long totalBytesRead = 0;
-
+    final long len = testDataStatus.getLen();
     in = openTestFile(policy, 0);
     ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
     for (int[] action : RANDOM_IO_SEQUENCE) {
-      int position = action[0];
+      long position = action[0];
       int range = action[1];
+      // if a read goes past EOF, fail with details
+      // this will happen if the test datafile is too small.
+      Assertions.assertThat(position + range)
+          .describedAs("readFully(pos=%d range=%d) of %s",
+              position, range, testDataStatus)
+          .isLessThanOrEqualTo(len);
       in.readFully(position, buffer, 0, range);
       totalBytesRead += range;
     }

+ 14 - 45
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java

@@ -22,61 +22,30 @@ import org.junit.Test;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
-import org.apache.hadoop.fs.s3a.S3AFileSystem;
-import org.apache.hadoop.fs.statistics.IOStatistics;
+import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;
 
-import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_ENDPOINT;
-import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT;
-import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath;
+import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
 import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_REQUEST;
-import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
 
 /**
  * Verify that AWS SDK statistics are wired up.
- * This test tries to read data from US-east-1 and us-west-2 buckets
- * so as to be confident that the nuances of region mapping
- * are handed correctly (HADOOP-13551).
- * The statistics are probed to verify that the wiring up is complete.
  */
-public class ITestAWSStatisticCollection extends AbstractS3ATestBase {
+public class ITestAWSStatisticCollection extends AbstractS3ACostTest {
 
-  private static final Path COMMON_CRAWL_PATH
-      = new Path("s3a://osm-pds/planet/planet-latest.orc");
-
-  @Test
-  public void testLandsatStatistics() throws Throwable {
-    final Configuration conf = getConfiguration();
-    // skips the tests if the landsat path isn't the default.
-    Path path = getLandsatCSVPath(conf);
-    conf.set(ENDPOINT, DEFAULT_ENDPOINT);
-    conf.unset("fs.s3a.bucket.landsat-pds.endpoint");
-
-    try (S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(conf)) {
-      fs.getS3AInternals().getObjectMetadata(path);
-      IOStatistics iostats = fs.getIOStatistics();
-      assertThatStatisticCounter(iostats,
-          STORE_IO_REQUEST.getSymbol())
-          .isGreaterThanOrEqualTo(1);
-    }
+  @Override
+  public Configuration createConfiguration() {
+    final Configuration conf = super.createConfiguration();
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, true);
+    return conf;
   }
 
   @Test
-  public void testCommonCrawlStatistics() throws Throwable {
-    final Configuration conf = getConfiguration();
-    // skips the tests if the landsat path isn't the default.
-    getLandsatCSVPath(conf);
-
-    Path path = COMMON_CRAWL_PATH;
-    conf.set(ENDPOINT, DEFAULT_ENDPOINT);
-
-    try (S3AFileSystem fs = (S3AFileSystem) path.getFileSystem(conf)) {
-      fs.getS3AInternals().getObjectMetadata(path);
-      IOStatistics iostats = fs.getIOStatistics();
-      assertThatStatisticCounter(iostats,
-          STORE_IO_REQUEST.getSymbol())
-          .isGreaterThanOrEqualTo(1);
-    }
+  public void testSDKMetricsCostOfGetFileStatusOnFile() throws Throwable {
+    describe("performing getFileStatus on a file");
+    Path simpleFile = file(methodPath());
+    // and repeat on the file looking at AWS wired up stats
+    verifyMetrics(() -> getFileSystem().getFileStatus(simpleFile),
+        with(STORE_IO_REQUEST, 1));
  }
 
 }

+ 82 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java

@@ -18,9 +18,13 @@
 
 package org.apache.hadoop.fs.s3a.test;
 
+import org.junit.Assume;
+
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3ATestConstants;
 import org.apache.hadoop.fs.s3a.S3ATestUtils;
 
 import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_BUCKET_WITH_MANY_OBJECTS;
@@ -69,6 +73,77 @@ public final class PublicDatasetTestUtils {
   private static final String DEFAULT_BUCKET_WITH_MANY_OBJECTS
       = "s3a://usgs-landsat/collection02/level-1/";
 
+  /**
+   * ORC dataset: {@value}.
+   */
+  private static final Path ORC_DATA = new Path("s3a://osm-pds/planet/planet-latest.orc");
+
+  /**
+   * Provide a Path for some ORC data.
+   *
+   * @param conf Hadoop configuration
+   * @return S3A FS URI
+   */
+  public static Path getOrcData(Configuration conf) {
+    return ORC_DATA;
+  }
+
+  /**
+   * Default path for the external test file: {@value}.
+   * This must be: gzipped, large enough for the performance
+   * tests and in a read-only bucket with anonymous access.
+   * */
+  public static final String DEFAULT_EXTERNAL_FILE =
+      "s3a://noaa-cors-pds/raw/2023/017/ohfh/OHFH017d.23_.gz";
+
+  /**
+   * Get the external test file.
+   * <p>
+   * This must be: gzipped, large enough for the performance
+   * tests and in a read-only bucket with anonymous access.
+   * @param conf configuration
+   * @return a dataset which meets the requirements.
+   */
+  public static Path getExternalData(Configuration conf) {
+    return new Path(fetchFromConfig(conf,
+        S3ATestConstants.KEY_CSVTEST_FILE, DEFAULT_EXTERNAL_FILE));
+  }
+
+  /**
+   * Get the anonymous dataset.
+   * @param conf configuration
+   * @return a dataset which supports anonymous access.
+   */
+  public static Path requireAnonymousDataPath(Configuration conf) {
+    return requireDefaultExternalData(conf);
+  }
+
+
+  /**
+   * Get the external test file; assume() that it is not modified (i.e. we haven't
+   * switched to a new storage infrastructure where the bucket is no longer
+   * read only).
+   * @return test file.
+   * @param conf test configuration
+   */
+  public static String requireDefaultExternalDataFile(Configuration conf) {
+    String filename = getExternalData(conf).toUri().toString();
+    Assume.assumeTrue("External test file is not the default",
+        DEFAULT_EXTERNAL_FILE.equals(filename));
+    return filename;
+  }
+
+  /**
+   * Get the test external file; assume() that it is not modified (i.e. we haven't
+   * switched to a new storage infrastructure where the bucket is no longer
+   * read only).
+   * @param conf test configuration
+   * @return test file as a path.
+   */
+  public static Path requireDefaultExternalData(Configuration conf) {
+    return new Path(requireDefaultExternalDataFile(conf));
+  }
+
   /**
    * Provide a URI for a directory containing many objects.
    *
@@ -97,6 +172,13 @@ public final class PublicDatasetTestUtils {
         KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE);
   }
 
+  /**
+   * Fetch a trimmed configuration value, require it to be non-empty.
+   * @param conf configuration file
+   * @param key key
+   * @param defaultValue default value.
+   * @return the resolved value.
+   */
   private static String fetchFromConfig(Configuration conf, String key, String defaultValue) {
     String value = conf.getTrimmed(key, defaultValue);
 

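For orientation, a minimal sketch (not taken from the patch; everything other than the PublicDatasetTestUtils API shown above is a placeholder) of how a test can resolve and read the external file, assuming the bucket still permits anonymous access:

    // hedged sketch: resolve the external file, honouring fs.s3a.scale.test.csvfile if set
    Configuration conf = new Configuration();
    Path external = PublicDatasetTestUtils.getExternalData(conf);
    FileSystem fs = external.getFileSystem(conf);
    try (FSDataInputStream in = fs.open(external)) {
      byte[] buffer = new byte[1024];
      in.readFully(0, buffer);  // positioned read of the first 1 KiB
    }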
+ 30 - 10
hadoop-tools/hadoop-aws/src/test/resources/core-site.xml

@@ -30,37 +30,57 @@
     <final>false</final>
   </property>
 
-  <!-- Per-bucket configurations: landsat-pds -->
   <!--
+    Test file for some scale tests.
+
     A CSV file in this bucket was used for testing S3 select.
     Although this feature has been removed, (HADOOP-18830)
     it is still used in some tests as a large file to read
-    in a bucket without write permissions.
-    These tests do not need a CSV file.
+    and as a file in a bucket without write permissions.
+    The original file s3a://landsat-pds/scene_list.gz is
+    on a now-inaccessible bucket.
   -->
+<!--
+  This is defined in PublicDatasetTestUtils;
+  if needed for older builds, this can be copied into
+  auth-keys along with the other bucket binding information,
+  which is all exclusively defined here.
+
   <property>
-    <name>fs.s3a.bucket.landsat-pds.endpoint.region</name>
-    <value>us-west-2</value>
-    <description>The region for s3a://landsat-pds</description>
+    <name>fs.s3a.scale.test.csvfile</name>
+    <value>s3a://noaa-cors-pds/raw/2024/001/akse/AKSE001x.24_.gz</value>
+    <description>file used in scale tests</description>
   </property>
+-->
 
   <property>
-    <name>fs.s3a.bucket.landsat-pds.multipart.purge</name>
+    <name>fs.s3a.bucket.noaa-cors-pds.endpoint.region</name>
+    <value>us-east-1</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.bucket.noaa-isd-pds.multipart.purge</name>
     <value>false</value>
     <description>Don't try to purge uploads in the read-only bucket, as
     it will only create log noise.</description>
   </property>
 
   <property>
-    <name>fs.s3a.bucket.landsat-pds.probe</name>
+    <name>fs.s3a.bucket.noaa-isd-pds.probe</name>
     <value>0</value>
     <description>Let's postpone existence checks to the first IO operation </description>
   </property>
 
   <property>
-    <name>fs.s3a.bucket.landsat-pds.audit.add.referrer.header</name>
+    <name>fs.s3a.bucket.noaa-isd-pds.audit.add.referrer.header</name>
     <value>false</value>
-    <description>Do not add the referrer header to landsat operations</description>
+    <description>Do not add the referrer header</description>
+  </property>
+
+  <property>
+    <name>fs.s3a.bucket.noaa-isd-pds.prefetch.block.size</name>
+    <value>128k</value>
+    <description>Use a small prefetch size so tests fetch multiple blocks</description>
   </property>
 
   <!-- Per-bucket configurations: usgs-landsat -->
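To point the tests at a different large, read-only, gzipped object, a hypothetical override in a private auth-keys.xml might look like the following (bucket and key are placeholders, not values taken from the patch):

    <property>
      <name>fs.s3a.scale.test.csvfile</name>
      <value>s3a://example-test-bucket/data/large-file.gz</value>
      <description>Alternative large, gzipped, read-only test file (placeholder)</description>
    </property>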