Browse Source

HADOOP-16465 listLocatedStatus() optimisation (#1943)

Contributed by Mukund Thakur

Optimize S3AFileSystem.listLocatedStatus() to perform list
operations directly and then fallback to head checks for files
Mukund Thakur 5 năm trước cách đây
mục cha
commit
7b2d84d19c

+ 57 - 30
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

@@ -4283,41 +4283,68 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
     RemoteIterator<? extends LocatedFileStatus> iterator =
         once("listLocatedStatus", path.toString(),
           () -> {
-            // lookup dir triggers existence check
-            final S3AFileStatus fileStatus =
-                (S3AFileStatus) getFileStatus(path);
-            if (fileStatus.isFile()) {
-              // simple case: File
-              LOG.debug("Path is a file");
-              return new Listing.SingleStatusRemoteIterator(
-                  filter.accept(path) ? toLocatedFileStatus(fileStatus) : null);
-            } else {
-              // directory: trigger a lookup
-              final String key = maybeAddTrailingSlash(pathToKey(path));
-              final Listing.FileStatusAcceptor acceptor =
-                  new Listing.AcceptAllButSelfAndS3nDirs(path);
-              boolean allowAuthoritative = allowAuthoritative(f);
-              DirListingMetadata meta =
-                  S3Guard.listChildrenWithTtl(metadataStore, path,
-                      ttlTimeProvider, allowAuthoritative);
-              final RemoteIterator<S3AFileStatus> cachedFileStatusIterator =
-                  listing.createProvidedFileStatusIterator(
-                      S3Guard.dirMetaToStatuses(meta), filter, acceptor);
-              return (allowAuthoritative && meta != null
-                  && meta.isAuthoritative())
-                  ? listing.createLocatedFileStatusIterator(
-                  cachedFileStatusIterator)
-                  : listing.createLocatedFileStatusIterator(
-                      listing.createFileStatusListingIterator(path,
-                          createListObjectsRequest(key, "/"),
-                          filter,
-                          acceptor,
-                          cachedFileStatusIterator));
+            // Assuming the path to be a directory,
+            // trigger a list call directly.
+            final RemoteIterator<S3ALocatedFileStatus>
+                    locatedFileStatusIteratorForDir =
+                    getLocatedFileStatusIteratorForDir(path, filter);
+
+            // If no listing is present then path might be a file.
+            if (!locatedFileStatusIteratorForDir.hasNext()) {
+              final S3AFileStatus fileStatus =
+                      (S3AFileStatus) getFileStatus(path);
+              if (fileStatus.isFile()) {
+                // simple case: File
+                LOG.debug("Path is a file");
+                return new Listing.SingleStatusRemoteIterator(
+                        filter.accept(path)
+                                ? toLocatedFileStatus(fileStatus)
+                                : null);
+              }
             }
+            // Either empty or non-empty directory.
+            return locatedFileStatusIteratorForDir;
           });
     return toLocatedFileStatusIterator(iterator);
   }
 
+  /**
+   * Generate list located status for a directory.
+   * Also performing tombstone reconciliation for guarded directories.
+   * @param dir directory to check.
+   * @param filter a path filter.
+   * @return an iterator that traverses statuses of the given dir.
+   * @throws IOException in case of failure.
+   */
+  private RemoteIterator<S3ALocatedFileStatus> getLocatedFileStatusIteratorForDir(
+          Path dir, PathFilter filter) throws IOException {
+    final String key = maybeAddTrailingSlash(pathToKey(dir));
+    final Listing.FileStatusAcceptor acceptor =
+        new Listing.AcceptAllButSelfAndS3nDirs(dir);
+    boolean allowAuthoritative = allowAuthoritative(dir);
+    DirListingMetadata meta =
+        S3Guard.listChildrenWithTtl(metadataStore, dir,
+            ttlTimeProvider, allowAuthoritative);
+    Set<Path> tombstones = meta != null
+            ? meta.listTombstones()
+            : null;
+    final RemoteIterator<S3AFileStatus> cachedFileStatusIterator =
+        listing.createProvidedFileStatusIterator(
+            S3Guard.dirMetaToStatuses(meta), filter, acceptor);
+    return (allowAuthoritative && meta != null
+        && meta.isAuthoritative())
+        ? listing.createLocatedFileStatusIterator(
+        cachedFileStatusIterator)
+        : listing.createTombstoneReconcilingIterator(
+            listing.createLocatedFileStatusIterator(
+            listing.createFileStatusListingIterator(dir,
+                createListObjectsRequest(key, "/"),
+                filter,
+                acceptor,
+                cachedFileStatusIterator)),
+            tombstones);
+  }
+
   /**
    * Build a {@link S3ALocatedFileStatus} from a {@link FileStatus} instance.
    * @param status file status

+ 57 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java

@@ -111,6 +111,63 @@ public class ITestS3AFileOperationCost extends AbstractS3ATestBase {
     skipDuringFaultInjection(fs);
   }
 
+  @Test
+  public void testCostOfLocatedFileStatusOnFile() throws Throwable {
+    describe("performing listLocatedStatus on a file");
+    Path file = path(getMethodName() + ".txt");
+    S3AFileSystem fs = getFileSystem();
+    touch(fs, file);
+    resetMetricDiffs();
+    fs.listLocatedStatus(file);
+    if (!fs.hasMetadataStore()) {
+      // Unguarded FS.
+      metadataRequests.assertDiffEquals(1);
+    }
+    listRequests.assertDiffEquals(1);
+  }
+
+  @Test
+  public void testCostOfListLocatedStatusOnEmptyDir() throws Throwable {
+    describe("performing listLocatedStatus on an empty dir");
+    Path dir = path(getMethodName());
+    S3AFileSystem fs = getFileSystem();
+    fs.mkdirs(dir);
+    resetMetricDiffs();
+    fs.listLocatedStatus(dir);
+    if (!fs.hasMetadataStore()) {
+      // Unguarded FS.
+      verifyOperationCount(2, 1);
+    } else {
+      if (fs.allowAuthoritative(dir)) {
+        verifyOperationCount(0, 0);
+      } else {
+        verifyOperationCount(0, 1);
+      }
+    }
+  }
+
+  @Test
+  public void testCostOfListLocatedStatusOnNonEmptyDir() throws Throwable {
+    describe("performing listLocatedStatus on a non empty dir");
+    Path dir = path(getMethodName() + "dir");
+    S3AFileSystem fs = getFileSystem();
+    fs.mkdirs(dir);
+    Path file = new Path(dir, "file.txt");
+    touch(fs, file);
+    resetMetricDiffs();
+    fs.listLocatedStatus(dir);
+    if (!fs.hasMetadataStore()) {
+      // Unguarded FS.
+      verifyOperationCount(0, 1);
+    } else {
+      if(fs.allowAuthoritative(dir)) {
+        verifyOperationCount(0, 0);
+      } else {
+        verifyOperationCount(0, 1);
+      }
+    }
+  }
+
   @Test
   public void testCostOfGetFileStatusOnFile() throws Throwable {
     describe("performing getFileStatus on a file");