소스 검색

HADOOP-18056. DistCp: Filter duplicates in the source paths. (#3825). Contributed by Ayush Saxena.

Reviewed-by: tomscut <litao@bigo.sg>
Reviewed-by: Steve Loughran <stevel@apache.org>
Ayush Saxena 3 년 전
부모
커밋
46b1411189

+ 14 - 1
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java

@@ -30,8 +30,10 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.tools.util.DistCpUtils;
 
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.EnumSet;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.NoSuchElementException;
 import java.util.Set;
@@ -233,7 +235,18 @@ public final class DistCpOptions {
 
   public List<Path> getSourcePaths() {
     return sourcePaths == null ?
-        null : Collections.unmodifiableList(sourcePaths);
+        null : // sourcePaths is null: hand null back unchanged
+        Collections.unmodifiableList(getUniquePaths(sourcePaths)); // read-only view over a de-duplicated copy
+  }
+
+  private List<Path> getUniquePaths(List<Path> srcPaths) { // Returns srcPaths without repeated entries.
+    Set<Path> uniquePaths = new LinkedHashSet<>(); // LinkedHashSet iterates in first-insertion order
+    for (Path path : srcPaths) {
+      if (!uniquePaths.add(path)) { // add() returns false when an equal path is already present
+        LOG.info("Path: {} added multiple times, ignoring the redundant entry.", path);
+      }
+    }
+    return new ArrayList<>(uniquePaths); // new list; the input list is not modified
   }
 
   public Path getTargetPath() {

+ 23 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java

@@ -167,6 +167,29 @@ public class TestCopyListing extends SimpleCopyListing {
     }
   }
 
+  @Test
+  public void testDuplicateSourcePaths() throws Exception { // Builds a copy listing from a source list that names the same path twice.
+    FileSystem fs = FileSystem.get(getConf());
+    List<Path> srcPaths = new ArrayList<Path>();
+    try {
+      srcPaths.add(new Path("/tmp/in"));
+      srcPaths.add(new Path("/tmp/in")); // same path again: this is the duplicate under test
+      TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt");
+      TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt");
+      Path target = new Path("/tmp/out");
+      Path listingFile = new Path("/tmp/list");
+      final DistCpOptions options =
+          new DistCpOptions.Builder(srcPaths, target).build();
+      final DistCpContext context = new DistCpContext(options);
+      CopyListing listing =
+          CopyListing.getCopyListing(getConf(), CREDENTIALS, context);
+      listing.buildListing(listingFile, context); // any exception thrown here fails the test
+      Assert.assertTrue(fs.exists(listingFile)); // only checks that the listing file exists, not its contents
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp"); // cleanup: asks the test helper to delete /tmp
+    }
+  }
+
   @Test(timeout=10000)
   public void testBuildListing() {
     FileSystem fs = null;